U
    sVc:                     @  sN  d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZ	ddl
mZmZmZmZ ddlmZ ddlmZ ddlZdd	lmZ dd
lmZ dZdZdZdZddddddddddddddddgZdZd Zd!Z d"Z!d#e d$e  d$e d$e! d%	Z"d&e d$e d'Z#d(Z$d)d*d+d,d-Z%d)d.d/d0Z&d1d2 Z'd3d4 Z(G d5d6 d6eej)Z*dS )7a-  
Read a SAS XPort format file into a Pandas DataFrame.

Based on code from Jack Cushman (github.com/jcushman/xport).

The file format is defined here:

https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf
    )annotations)abc)datetimeN)CompressionOptionsDatetimeNaTTypeFilePath
ReadBuffer)Appender)find_stack_level)
get_handle)
ReaderBasezPHEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000  zKHEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!000000000000000001600000000zPHEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000  zPHEADER RECORD*******OBS     HEADER RECORD!!!!!!!000000000000000000000000000000  ntypeZnhfunfield_lengthZnvar0namelabelZnformZnflZnum_decimalsZnfjZnfillZniformZniflZnifdZnpos_zParameters
----------
filepath_or_buffer : str or file-like object
    Path to SAS file or object implementing binary read method.zindex : identifier of index column
    Identifier of column that should be used as index of the DataFrame.
encoding : str
    Encoding for text data.
chunksize : int
    Read file `chunksize` lines at a time, returns iterator.zBformat : str
    File format, only `xport` is currently supported.z\iterator : bool, default False
    Return XportReader object for reading file incrementally.z#Read a SAS file into a DataFrame.


a  

Returns
-------
DataFrame or XportReader

Examples
--------
Read a SAS Xport file:

>>> df = pd.read_sas('filename.XPT')

Read a Xport file in 10,000 line chunks:

>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
>>>     do_something(chunk)

z$Class for reading SAS Xport files.

z

Attributes
----------
member_info : list
    Contains information about the file
fields : list
    Contains information about the variables in the file
zRead observations from SAS Xport file, returning as data frame.

Parameters
----------
nrows : int
    Number of rows to read from data file; if None, read whole
    file.

Returns
-------
A DataFrame.
strr   )datestrreturnc                 C  s.   zt | dW S  tk
r(   tj Y S X dS )z1Given a date in xport format, return Python date.z%d%b%y:%H:%M:%SN)r   strptime
ValueErrorpdZNaT)r    r   ;/tmp/pip-unpacked-wheel-xj8nt62q/pandas/io/sas/sas_xport.py_parse_date   s    r   )sc                 C  s@   i }d}|D ](\}}| |||    ||< ||7 }q|d= |S )a  
    Parameters
    ----------
    s: str
        Fixed-length string to split
    parts: list of (name, length) pairs
        Used to break up string, name '_' will be filtered from output.

    Returns
    -------
    Dict of name:contents of string at given location.
    r   r   )strip)r   partsoutstartr   lengthr   r   r   _split_line   s    
r"   c                 C  sT   |dkrPt t| t d}t d| dd|  }|j|d}| |d< |S | S )N   ZS8Sz,Sdtypef0)npzeroslenr&   view)vecnbytesvec1r&   Zvec2r   r   r   _handle_truncated_float_vec   s    	r/   c           	      C  s  t d}| j|d}|d }|d }|d@ }t jt| t jd}d|t |d@ < d|t |d	@ < d
|t |d@ < ||L }||? |d@ dd
|  > B }|dM }||d? d@ d d> | d d> |d@ B O }t jt|fdd}||d< ||d< |jdd}|d}|S )zf
    Parse a vector of float values representing IBM 8 byte floats into
    native 8 byte floats.
    z>u4,>u4r%   r'   f1i    i       i  @    i         l          A   i     l        z>f8Zf8)	r(   r&   r+   r)   r*   Zuint8whereemptyZastype)	r,   r&   r.   Zxport1Zxport2Zieee1shiftZieee2Zieeer   r   r   _parse_float_vec   s*    
		 
r=   c                   @  s   e Zd ZeZd dddddd	d
ZddddZdd Zdd ZddddZ	ddddZ
d!ddddZdd Zeed"dddddZdS )#XportReaderN
ISO-8859-1inferzFilePath | ReadBuffer[bytes]z
str | Noner   None)filepath_or_bufferencodingcompressionr   c                 C  sf   || _ d| _|| _|| _t|d|d|d| _| jj| _z|   W n t	k
r`   | 
   Y nX d S )Nr   rbF)rC   Zis_textrD   )	_encoding_lines_read_index
_chunksizer   handleshandlerB   _read_header	Exceptionclose)selfrB   indexrC   	chunksizerD   r   r   r   __init__   s"    	
zXportReader.__init__)r   c                 C  s   | j   d S )N)rJ   rN   rO   r   r   r   rN     s    zXportReader.closec                 C  s   | j d S )NP   )rB   readdecoderS   r   r   r   _get_row  s    zXportReader._get_rowc              	   C  sB  | j d |  }|tkr4d|kr,tdtd|  }ddgddgd	dgd
dgddgg}t||}|d dkr|tdt|d |d< || _|  }t|d d |d< |  }|  }|t	}|t
k}	|r|	stdt|dd }
ddgddgddgddgd	dgd
dgddgg}t|  |}ddgd
dgddgddgg}|t|  | t|d |d< t|d |d< || _ddd}t|  dd }|
| }|d r|d|d  7 }| j |}g }d}t||
kr|d |
 ||
d   }}|d}td|}ttt|}|d
= ||d   |d < |d! }|d  dkrv|d"k sb|dkrvd#| d$}t|| D ]2\}}z| ||< W n tk
r   Y nX q~||d! 7 }||g7 }q|  }|tkstd%|| _|| _| j  | _|  | _ d&d' | jD | _!d(d' t"| jD }t#$|}|| _%d S ))Nr   z**COMPRESSED**z<Header record indicates a CPORT file, which is not readable.z#Header record is not an XPORT file.prefixr6   versionr#   ZOSr   created   zSAS     SAS     SASLIBz!Header record has invalid prefix.modifiedzMember header not foundset_nameZsasdatar   (   typenumericchar)r1   r2   6   :   rT      z>hhhh8s40s8shhh2s8shhl52sr   r   r2   zFloating field width z is not between 2 and 8.zObservation header not found.c                 S  s   g | ]}|d    qS )r   )rV   ).0xr   r   r   
<listcomp>  s     z,XportReader._read_header.<locals>.<listcomp>c                 S  s,   g | ]$\}}d t | dt |d  fqS )r   r$   r   )r   )rg   ifieldr   r   r   ri     s   )&rB   seekrW   _correct_line1r   r"   r   	file_info
startswith_correct_header1_correct_header2intupdatemember_inforU   r*   ljuststructunpackdictzip
_fieldkeys	TypeErroritemsr   AttributeError_correct_obs_headerfieldsrecord_lengthtellrecord_start_record_countnobscolumns	enumerater(   r&   _dtype)rO   Zline1Zline2Zfifrn   Zline3Zheader1Zheader2Z	headflag1Z	headflag2ZfieldnamelengthZmemrt   typesZ
fieldcountZ
datalengthZ	fielddatar   Z
obs_lengthZ
fieldbytesZfieldstructrk   flmsgkvheaderZdtypelr&   r   r   r   rL   "  s    "

	




"



zXportReader._read_headerzpd.DataFramec                 C  s   | j | jpddS )Nr1   nrows)rU   rI   rS   r   r   r   __next__  s    zXportReader.__next__rr   c                 C  s   | j dd | j  | j }|d dkr:tjdt d | jdkr\| j | j || j S | j dd | j d}t	j
|t	jd}t	|dk}t|dkrd}nd	t| }| j | j || | j S )
z
        Get number of records in file.

        This is maybe suboptimal because we have to seek to the end of
        the file.

        Side effect: returns file position to record_start.
        r   r2   rT   zxport file may be corrupted.)
stacklevelir%   l     @@  r#   )rB   rl   r   r   warningswarnr
   r   rU   r(   
frombufferZuint64Zflatnonzeror*   )rO   Ztotal_records_lengthZlast_card_bytesZ	last_cardixZtail_padr   r   r   r     s&    	

zXportReader._record_countc                 C  s   |dkr| j }| j|dS )a  
        Reads lines from Xport file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read.  If None, reads whole file.

        Returns
        -------
        DataFrame
        Nr   )rI   rU   )rO   sizer   r   r   	get_chunk  s    zXportReader.get_chunkc                 C  sl   |j dd}|d dk|d dk@ |d dk@ }|d dk|d d	k@ |d d
kB |d dkB }||M }|S )Nzu1,u1,u2,u4r%   r0   r   f2Zf3r'   r8   Z   _   .   )r+   )rO   r,   r   missZmiss1r   r   r   _missing_double  s    $

zXportReader._missing_doublez
int | None)r   r   c                   sf  |d kr j }t| j  j }| j }|dkr>   t j|}tj	| j
|d}tjt|d}t jD ]\}}|dt|  }	 j| d }
|
dkrt|	 j| d }	 |	}t|	}tj||< n@ j| d dkrd	d
 |	D } jd k	r fdd
|D }|||< qv jd krHtt j j| |_n| j}  j|7  _|S )Nr   )r&   count)rP   r   r   rb   r   rc   c                 S  s   g | ]}|  qS r   )rstriprg   yr   r   r   ri     s     z$XportReader.read.<locals>.<listcomp>c                   s   g | ]}|  jqS r   )rV   rF   r   rS   r   r   ri     s     )r   minrG   r   rN   StopIterationrB   rU   r(   r   r   r   Z	DataFrameranger   r   r   r   r/   r   r=   nanrF   rH   ZIndexrP   Z	set_index)rO   r   
read_linesread_lenrawdataZdfjrh   r,   r   r   r   r   rS   r   rU     s8    


zXportReader.read)Nr?   Nr@   )N)N)__name__
__module____qualname___xport_reader_doc__doc__rR   rN   rW   rL   r   r   r   r   r	   _read_method_docrU   r   r   r   r   r>      s       n&r>   )+r   
__future__r   collectionsr   r   rv   r   Znumpyr(   Zpandas._typingr   r   r   r   Zpandas.util._decoratorsr	   Zpandas.util._exceptionsr
   Zpandasr   Zpandas.io.commonr   Zpandas.io.sas.sasreaderr   rm   rp   rq   r~   rz   Z_base_params_docZ_params2_docZ_format_params_docZ_iterator_docZ_read_sas_docr   r   r   r"   r/   r=   Iteratorr>   r   r   r   r   <module>   sz   		9