Source code for sciris.sc_dataframe

"""
Extension of the pandas dataframe to be more flexible, especially with filtering
rows/columns and concatenating data.
"""

##############################################################################
#%% Dataframe
##############################################################################

import numbers # For numeric type
import numpy as np
import pandas as pd
import warnings
import sciris as sc

# Detect the installed pandas (major, minor) version so that version-specific
# code paths (e.g. arguments removed in pandas 3.0) can be selected at runtime
_pandas_version = tuple(map(int, pd.__version__.split('.')[:2]))
_pandas_ge_3 = (3, 0) <= _pandas_version

__all__ = ['dataframe']



[docs]
class dataframe(pd.DataFrame):
    """
    An extension of the pandas :class:`DataFrame <pandas.DataFrame>` with additional convenience methods for
    accessing rows and columns and performing other operations, such as adding rows.

    Args:
        data (dict/array/dataframe): the data to use; passed to :class:`pd.DataFrame() <pandas.DataFrame>`
        index (array): the index to use; passed to :class:`pd.DataFrame() <pandas.DataFrame>`
        columns (list): column labels (if a dict is supplied, the value sets the dtype)
        dtype (type): a dtype for the whole dataframe; passed to :class:`pd.DataFrame() <pandas.DataFrame>`
        copy (bool): whether to copy the data (ignored in pandas ≥ 3.0.0 due to Copy-on-Write behavior)
        dtypes (list/dict): alternatively, list of data types to set each column to
        nrows (int): the number of rows to preallocate (default 0)
        kwargs (dict): if provided, treat these as data columns

    *Hint*: Run the example below line by line to get a sense of how the dataframe
    changes.

    **Examples**::

        df = sc.dataframe(cols=['x','y'], data=[[1238,2],[384,5],[666,7]]) # Create data frame
        df['x'] # Print out a column
        df[0] # Print out a row
        df['x',0] # Print out an element
        df[0,:] = [123,6]; print(df) # Set values for a whole row
        df['y'] = [8,5,0]; print(df) # Set values for a whole column
        df['z'] = [14,14,14]; print(df) # Add new column
        df.rmcol('z'); print(df) # Remove a column
        df.addcol('z', [14,14,14]); print(df) # Alternate way to add new column
        df.poprow(1); print(df) # Remove a row
        df.append([555,2,14]); print(df) # Append a new row
        df.insertrow(1,[556,2,14]); print(df) # Insert a new row
        df.sort(); print(df) # Sort by the first column
        df.sort('y'); print(df) # Sort by the second column
        df.findrow(123) # Return the row starting with value 123
        df.rmrow(); print(df) # Remove last row
        df.rmrow(555); print(df) # Remove the row starting with element '555'

        # Direct setting of data
        df = sc.dataframe(a=[1,2,3], b=[4,5,6])

    The dataframe can be used for both numeric and non-numeric data.

    | *New in version 2.0.0:* subclass pandas DataFrame
    | *New in version 3.0.0:* "dtypes" argument; handling of item setting
    | *New in version 3.1.0:* use pandas' equality operator by default (unless an exception is raised); new "equal" method; "cat" can be an instance method now
    | *New in version 3.2.5:* pandas 3.0.0 compatibility
    """

    def __init__(self, data=None, index=None, columns=None, dtype=None, copy=None,
                 dtypes=None, nrows=None, **kwargs):
        """
        Construct the dataframe; see the class docstring for a description of
        each argument.

        Besides the standard pandas arguments, this handles the "cols" alias for
        "columns", preallocation of ``nrows`` rows of zeros, a dict of
        ``{column: dtype}`` supplied as "columns", and data columns supplied as
        keyword arguments. A dict passed as ``data`` is never modified.

        Raises:
            ValueError: if both "cols" and "columns" are supplied, or if dtypes
                are supplied both separately and via a columns dict
            TypeError: if keyword data columns are combined with non-dict data
        """

        # Handle inputs
        if 'cols' in kwargs:
            if columns is not None: # pragma: no cover
                raise ValueError('The argument "cols" is an alias for "columns", do not supply both')
            columns = kwargs.pop('cols')
        if nrows and data is None:
            ncols = len(columns)
            data = np.zeros((nrows, ncols))

        # Handle columns and dtypes
        if isinstance(columns, dict):
            if dtypes is not None:
                errormsg = 'You can supply dtypes as a separate argument or as part of a columns dict, but not both'
                raise ValueError(errormsg)
            dtypes = columns # Already in the right format
            columns = list(columns.keys())

        # Handle data
        if kwargs:
            if data is None:
                data = kwargs
            elif isinstance(data, dict):
                data = {**data, **kwargs} # Merge into a new dict so the caller's dict is not mutated
            else:
                errormsg = f'When providing data columns via keywords ("{sc.strjoin(kwargs.keys())}"), these can only be combined with a dict, not an object of {type(data)}. Pass the data as a dict instead.'
                raise TypeError(errormsg)

        # Check data and column compatibility
        if isinstance(data, dict) and columns is not None:
            colset = set(columns)
            dataset = set(data.keys())
            match = colset & dataset
            if not len(match):
                warnmsg = 'No overlap between column names and data keys, are you sure you want to do this?'
                warnmsg += f'\nColumns: {colset}\nData keys: {dataset}'
                warnings.warn(warnmsg, category=RuntimeWarning, stacklevel=2)

        # Create the dataframe
        # Note: In pandas ≥ 3.0.0, the 'copy' parameter is deprecated due to Copy-on-Write behavior
        if _pandas_ge_3:
            super().__init__(data=data, index=index, columns=columns, dtype=dtype)
        else:
            super().__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy)

        # Optionally set dtypes
        if dtypes is not None:
            self.set_dtypes(dtypes)

        return


    @property
    def cols(self):
        """ Get columns as a list """
        return self.columns.tolist()



[docs]
    def set_dtypes(self, dtypes):
        """
        Set dtypes in-place (see :meth:`df.astype() <pandas.DataFrame.astype>` for the user-facing version)

        *New in version 3.0.0.*
        """
        if not isinstance(dtypes, dict):
            dtypes = {col:dtype for col,dtype in zip(self.columns, dtypes)}
        for col,dtype in dtypes.items(): # NB: "self.astype(dtypes)" does not modify in place (Copy-on-Write in pandas ≥ 3.0.0)
            self[col] = self[col].astype(dtype)
        return




[docs]
    def col_index(self, col=None, *args, die=True):
        """
        Return the positional index of the column(s) called ``col``.

        Comparable to ``df.columns.get_loc(col)``; the inverse of :meth:`df.col_name <dataframe.col_name>`.
        Column names are matched first; otherwise a number is treated as a position.

        Args:
            col (str/list): the column(s) to look up (None maps to index 0)
            args (list): further column(s) to look up
            die (bool): if True, raise an exception for an unrecognized column; otherwise print the message and use None

        Returns:
            A single index if one column was requested, else a list of indices

        **Examples**::

            df = sc.dataframe(dict(a=[1,2,3], b=[4,5,6], c=[7,8,9]))
            df.col_index('b') # Returns 1
            df.col_index(1) # Returns 1
            df.col_index('a', 'c') # Returns [0, 2]

        *New in version 3.0.0:* renamed from "_sanitizecols"; multiple arguments
        """
        requested = sc.mergelists(col, list(args), keepnone=True)
        names = self.cols
        indices = []
        for item in requested:
            if item is None:
                position = 0 # Nothing supplied: default to the first column
            elif item in names:
                position = names.index(item) # A column name: convert to its position
            elif sc.isnumber(item):
                try:
                    names[item] # Only checking that the position is in range
                except IndexError as E: # pragma: no cover
                    errormsg = f'Column "{item}" is not a valid index; there are {len(names)} columns'
                    raise IndexError(errormsg) from E
                position = item
            else: # pragma: no cover
                errormsg = f'Unrecognized column/column type "{item}" {type(item)}'
                if die:
                    raise TypeError(errormsg)
                print(errormsg)
                position = None
            indices.append(position)
        return indices[0] if len(indices) == 1 else indices




[docs]
    def col_name(self, col=None, *args, die=True):
        """
        Get the name of the column(s) with index ``col``.

        Similar to ``df.columns[col]``, and opposite of :meth:`df.col_index <dataframe.col_index>`.

        **Note**: This method always looks for named columns first. If ``col`` is
        the name of a column, it will return ``col`` rather than ``columns[col]``. See
        example below for more information.

        Args:
            col (int/list): the column(s) to get the name of (use the first column if None)
            args (list): additional column(s) to get the name of
            die (bool): whether to raise an exception if the column could not be found (else, return None)

        Returns:
            A single name if one column was requested, else a list of names

        Raises:
            IndexError: if a numeric ``col`` is not a valid position
            TypeError: if ``col`` is neither a column name nor a number (and ``die`` is True)

        **Examples**::

            df = sc.dataframe(dict(a=[1,2,3], b=[4,5,6], c=[7,8,9]))
            df.col_name(1) # Returns 'b'
            df.col_name('b') # Returns 'b'
            df.col_name(0, 2) # Returns ['a', 'c']
            df.col_name() # Returns 'a'

        *New in version 3.0.0.*
        """
        arglist = sc.mergelists(col, list(args), keepnone=True)
        outputlist = []
        cols = self.cols
        for col in arglist:
            if col is None:
                col = 0 # If not supplied, assume first column is intended, then resolve it below
            if col in cols:
                output = col # It's already a column
            elif sc.isnumber(col):
                try:
                    output = cols[col]
                except Exception as E: # pragma: no cover
                    errormsg = f'Column "{col}" is not a valid index'
                    raise IndexError(errormsg) from E
            else: # pragma: no cover
                errormsg = f'Unrecognized column/column type "{col}" {type(col)}'
                if die:
                    raise TypeError(errormsg)
                else:
                    print(errormsg)
                    output = None
            outputlist.append(output)
        if len(outputlist) == 1:
            outputlist = outputlist[0]
        return outputlist




[docs]
    def get(self, key):
        """ Alias to pandas __getitem__ method; rarely used """
        return super().__getitem__(key)




[docs]
    def set(self, key, value=None):
        """ Alias to pandas __setitem__ method; rarely used """
        return super().__setitem__(key, value)



    def __getitem__(self, key=None, die=True, cast=True):
        """ Simple method for returning; see self.flexget() for a version based on col and row """
        try: # Default to the pandas version
            output = super().__getitem__(key)
        except: # ...but handle a wider variety of keys
            try:
                output = super().iloc[key]
            except:
                if sc.isstring(key): # e.g. df['a'] -- usually handled by pandas # pragma: no cover
                    rowindex = slice(None)
                    try:
                        colindex = self.cols.index(key)
                    except ValueError:
                        errormsg = f'Key "{key}" is not a valid column; choices are: {sc.strjoin(self.cols)}'
                        raise sc.KeyNotFoundError(errormsg)
                elif isinstance(key, (numbers.Number, list, np.ndarray, slice)): # e.g. df[0], df[[0,2]], df[:4]
                    rowindex = key
                    colindex = slice(None)
                elif isinstance(key, tuple):
                    rowindex = key[0]
                    colindex = key[1]
                    if sc.isstring(rowindex) and not sc.isstring(colindex): # Swap order if one's a string and the other isn't
                        rowindex, colindex = colindex, rowindex
                    if sc.isstring(colindex): # e.g. df['a',0]
                        colindex = self.cols.index(colindex)
                else: # pragma: no cover
                    errormsg = f'Unrecognized dataframe key of {type(key)}: must be str, numeric, or tuple'
                    if die:
                        raise sc.KeyNotFoundError(errormsg)
                    else:
                        print(errormsg)
                        output = None
                output = self.iloc[rowindex,colindex]

        return output


    def __setitem__(self, key, value=None):
        try:
            # Use regular pandas for everything except keys that look like (0,'a'), ('a',0), (0,0), or (0,:)
            if isinstance(key, tuple) and (key not in self.columns) and (len(key) == 2) and all([isinstance(k, (int, str, Ellipsis)) for k in key]):
                raise NotImplementedError # Break out of the loop
            super().__setitem__(key, value)
        except Exception as E1:
            cols = self.cols
            try:
                rowindex = key[0]
                colindex = key[1]
                if rowindex in cols and colindex not in cols: # Swap order if one's a string and the other isn't
                    rowindex, colindex = colindex, rowindex
                if colindex in cols: # e.g. df['a',0]
                    colindex = cols.index(colindex)
                self.iloc[rowindex, colindex] = value
            except Exception as E2: # pragma: no cover
                if isinstance(E1, NotImplementedError): # We tried to raise it, so only care about the second one
                    mainerr = E2
                    errstr = f'\n{E2}'
                else: # An actual pandas error, raise both
                    mainerr = E1
                    errstr = f'\n{E1}\n{E2}'
                exc = type(mainerr)
                errormsg = f'Could not understand key {key}:{errstr}'
                raise exc(errormsg) from mainerr
        return



[docs]
    def flexget(self, cols=None, rows=None, asarray=False, cast=True, default=None):
        """
        A more flexible way of pulling data out of a dataframe. Whereas indexing
        directly by key usually gives back the underlying data, this method usually
        gives back another dataframe.

        Args:
            cols (str/list): the column(s) to get
            rows (int/list): the row(s) to get
            asarray (bool): whether to return an array (otherwise, return a dataframe)
            cast (bool): not used by this method
            default (any): not used by this method

        Returns:
            The bare value if exactly one element is selected; otherwise an array
            (if ``asarray``) or a new dataframe

        **Example**::

            df = sc.dataframe(cols=['x','y','z'],data=[[1238,2,-1],[384,5,-2],[666,7,-3]]) # Create data frame
            df.flexget(cols=['x','z'], rows=[0,2])
        """
        # Convert the requested columns to positions; Ellipsis stands for "everything"
        if cols is None: # pragma: no cover
            colindices = Ellipsis
        else:
            colindices = [self.col_index(name) for name in sc.tolist(cols)]
        rowindices = Ellipsis if rows is None else rows

        selected = self.iloc[rowindices,colindices] # Positional lookup, so non-consecutive entries work for both axes

        if selected.size == 1: # A single element: return the value itself rather than a container
            return np.array(selected).flatten()[0]
        if asarray:
            return np.array(selected)
        return self._constructor(data=selected, columns=np.array(self.cols)[colindices].tolist())



    def __eq__(self, other):
        """
        Try the default ``__eq__()``, but fall back on the more robust ``equal()``

        *New in version 3.0.0.*
        """
        try: # First try default comparison
            return super().__eq__(other)
        except: # Otherwise, use the custom check # pragma: no cover
            return self.equals(other)


[docs]
    @classmethod
    def equal(cls, *args, equal_nan=True):
        """
        Class method returning boolean true/false equals that allows for more robust equality checks:
        same type, size, columns, and values. See :meth:`df.equals() <dataframe.equals>` for
        equivalent instance method.

        **Examples**::

            df1 = sc.dataframe(a=[1, 2, np.nan])
            df2 = sc.dataframe(a=[1, 2, 4])

            sc.dataframe.equal(df1, df1) # Returns True
            sc.dataframe.equal(df1, df1, equal_nan=False) # Returns False
            sc.dataframe.equal(df1, df2) # Returns False
            sc.dataframe.equal(df1, df1, df2) # Also returns False

        *New in version 3.1.0.*
        """
        if len(args) < 2: # pragma: no cover
            errormsg = f'There must be ≥2 input arguments, not {len(args)}'
            raise ValueError(errormsg)
        base = args[0]
        others = args[1:]
        eqs = []

        # Handle NaNs
        if equal_nan:
            base = base.fillna(sc.sc_math._nan_fill)

        for other in others:

            # Check type
            if not isinstance(other, base.__class__):
                eq = False

            # Check shape
            elif base.values.shape != other.values.shape:
                eq = False

            # Check columns
            elif not np.all(base.columns == other.columns):
                eq = False

            # Finally, check values
            else:
                if equal_nan:
                    other = other.fillna(sc.sc_math._nan_fill)
                eq = np.all(base.values == other.values)

            eqs.append(eq)

        all_eq = all(eqs)

        return all_eq




[docs]
    def equals(self, other, *args, equal_nan=True):
        """
        Try the default :meth:`equals() <pandas.DataFrame.equals>`, but fall back
        on the more robust :meth:`sc.dataframe.equal() <dataframe.equal>` if that
        fails.

        *New in version 3.1.0.*
        """
        try:
            assert equal_nan # Regular pandas equals can't handle non-equal NaNs
            assert len(args) == 0 # Regular pandas can't handle multiple arguments
            return super().equals(other)
        except: # Otherwise, do manual check
            return self.equal(self, other, *args, equal_nan=equal_nan)




[docs]
    def disp(self, nrows=None, ncols=None, width=999, precision=4, options=None, **kwargs):
        """
        Print the dataframe with flexible display settings; by default, every row
        and column is shown.

        Args:
            nrows (int): maximum number of rows to show (default: all)
            ncols (int): maximum number of columns to show (default: all)
            width (int): maximum screen width (default: 999)
            precision (int): number of decimal places to show (default: 4)
            options (dict): an optional dictionary of additional options, passed to :class:`pd.option_context() <pandas.option_context>`
            kwargs (dict): also passed to :class:`pd.option_context() <pandas.option_context>`, with 'display.' prepended if needed

        **Examples**::

            df = sc.dataframe(data=np.random.rand(100,10))
            df.disp()
            df.disp(precision=1, ncols=5, colheader_justify='left')

        *New in version 2.0.1.*
        """
        # Prefix any keyword that names a pandas display option with 'display.'
        display_names = dir(pd.options.display)
        kwdict = {(f'display.{name}' if name in display_names else name): val for name,val in kwargs.items()}

        defaults = {
            'display.max_rows': nrows,
            'display.max_columns': ncols,
            'display.width': width,
            'display.precision': precision,
        }
        opts = sc.mergedicts(defaults, options, kwdict)

        # pd.option_context() expects a flat sequence of name, value, name, value, ...
        optslist = []
        for name,val in opts.items():
            optslist.extend((name, val))

        with pd.option_context(*optslist):
            print(self)
        return




[docs]
    def replacedata(self, newdata=None, newdf=None, reset_index=True, inplace=True):
        """
        Swap the contents of the dataframe for other data. Rarely called directly
        by the user; used internally by e.g. :meth:`df.concat() <dataframe.concat>`.

        Args:
            newdata (array): replace the dataframe's data with these data (keeping the current columns)
            newdf (dataframe): substitute the current dataframe with this one
            reset_index (bool): update the index (applied to the replacement dataframe in-place)
            inplace (bool): whether to modify in-place

        Returns:
            This dataframe if ``inplace``, otherwise the replacement dataframe

        *New in version 3.0.0:* improved dtype handling
        *New in version 3.2.5:* support deprecation of the verify_is_copy argument in Pandas 3.0
        """
        if newdf is None: # pragma: no cover
            newdf = self._constructor(data=newdata, columns=self.columns)
        if reset_index:
            newdf.reset_index(drop=True, inplace=True)
        if not inplace:
            return newdf

        # Note: the verify_is_copy parameter was removed in pandas 3.0.0, so only pass it to older versions
        update_kwargs = {} if _pandas_ge_3 else {'verify_is_copy': False}
        self._update_inplace(newdf, **update_kwargs)
        return self




[docs]
    def appendrow(self, row, reset_index=True, inplace=True):
        """
        Append row(s) to the bottom of the dataframe.

        See also :meth:`df.concat() <dataframe.concat>` and :meth:`df.insertrow() <dataframe.insertrow>`.

        Args:
            row (array): the row(s) to append
            reset_index (bool): update the index
            inplace (bool): whether to modify in-place

        Note: "appendrow" and "concat" are equivalent, except appendrow() defaults
        to modifying in-place and "concat" defaults to returning a new dataframe.

        Warning: modifying dataframes in-place is quite inefficient. For highest
        performance, construct the data in large chunks and then add to the dataframe
        all at once, rather than adding row by row.

        **Example**::

            import sciris as sc
            import numpy as np

            df = sc.dataframe(dict(
                a = ['foo','bar'],
                b = [1,2],
                c = np.random.rand(2)
            ))
            df.appendrow(['cat', 3, 0.3])           # Append a list
            df.appendrow(dict(a='dog', b=4, c=0.7)) # Append a dict

        *New in version 3.0.0:* renamed "value" to "row"; improved performance
        """
        options = dict(reset_index=reset_index, inplace=inplace)
        return self.concat(row, **options)




[docs]
    def append(self, row, reset_index=True, inplace=True):
        """
        Alias to :meth:`appendrow() <dataframe.appendrow>`: append row(s) to the
        end of the dataframe, in-place by default.

        **Note**: `pd.DataFrame.append` was deprecated in pandas version 2.0; see
        https://github.com/pandas-dev/pandas/issues/35407 for details. This method
        is instead implemented via :meth:`df.concat() <dataframe.concat>`.

        Args:
            row (array): the row(s) to append
            reset_index (bool): update the index
            inplace (bool): whether to modify in-place

        *New in version 3.0.0.*
        """
        options = dict(reset_index=reset_index, inplace=inplace)
        return self.concat(row, **options)




[docs]
    def insertrow(self, index=0, value=None, reset_index=True, inplace=True, die=True, **kwargs):
        """
        Insert row(s) at the given position. See also :meth:`df.concat() <dataframe.concat>`
        and :meth:`df.appendrow() <dataframe.appendrow>`.

        Args:
            index (int): index at which to insert new row(s)
            value (array): the row(s) to insert; can be an array, list, or dict
            reset_index (bool): update the index
            inplace (bool): whether to modify in-place
            die (bool): raise an exception if the length/columns of the inserted row do not match the existing dataframe
            kwargs (dict): passed to :meth:`sc.dataframe.cat() <dataframe.cat>`

        Raises:
            TypeError: if ``die`` and the value is not iterable
            ValueError: if ``die`` and the length or dict keys of the value do not match the columns

        Warning: modifying dataframes in-place is quite inefficient. For highest
        performance, construct the data in large chunks and then add to the dataframe
        all at once, rather than adding row by row.

        **Example**::

            import sciris as sc
            import numpy as np

            df = sc.dataframe(dict(
                a = ['foo','cat'],
                b = [1,3],
                c = np.random.rand(2)
            ))
            df.insertrow(1, ['bar', 2, 0.2])           # Insert a list
            df.insertrow(0, dict(a='rat', b=0, c=0.7)) # Insert a dict

        | *New in version 3.0.0:* renamed "row" to "index"
        | *New in version 3.2.3:* "die" argument
        """
        # Validation (only performed for dataframes with more than one column)
        if die and self.ncols>1:
            if not sc.isiterable(value):
                errormsg = f'Must supply an iterable for the row, not {value}'
                raise TypeError(errormsg)
            if len(value) != self.ncols:
                errormsg = f'Length mismatch: expecting {self.ncols}, but got {len(value)}'
                raise ValueError(errormsg)
            if isinstance(value, dict):
                supplied = set(value.keys())
                expected = set(self.columns)
                if supplied != expected:
                    extra = supplied - expected
                    missing = expected - supplied
                    extrastr = f'\nExtra: {extra}' if extra else ''
                    missingstr = f'\nMissing: {missing}' if missing else ''
                    errormsg = f'Expecting columns:\n{self.columns}\nbut got:\n{value.keys()}' + extrastr + missingstr
                    raise ValueError(errormsg)

        # Perform insertion: split the existing rows at the index and join with the new value in between
        head = self.iloc[:index,:]
        tail = self.iloc[index:,:]
        newdf = self.cat(head, value, tail, **kwargs)
        return self.replacedata(newdf=newdf, reset_index=reset_index, inplace=inplace)



    def _sanitize_df(self, arg, columns=None, **kwargs):
        """ Helper function to sanitize input into the correct format for constructing a new dataframe """
        if isinstance(arg, pd.DataFrame):
            df = arg
        else:
            if isinstance(arg, dict):
                columns = list(arg.keys())
                arg = list(arg.values())
            argarray = arg if isinstance(arg, np.ndarray) else np.array(arg) # Solely for checking the shape
            if argarray.shape == (self.ncols,): # If it's a single row with the right number of columns, make 2D
                arg = [arg]
            df = self._constructor(data=arg, columns=columns, **kwargs)
        return df



[docs]
    def concat(self, data, *args, columns=None, reset_index=True, inplace=False, dfargs=None, **kwargs):
        """
        Join additional data onto the end of the current dataframe.

        Similar to :meth:`df.appendrow() <dataframe.appendrow>` and :meth:`df.insertrow() <dataframe.insertrow>`;
        see also :meth:`sc.dataframe.cat() <dataframe.cat>` for the equivalent class method.

        Args:
            data (dataframe/array): the data to concatenate
            *args (dataframe/array): additional data to concatenate
            columns (list): if supplied, columns to go with the data (default: the current columns)
            reset_index (bool): update the index
            inplace (bool): whether to append in place
            dfargs (dict): arguments passed to construct each dataframe
            **kwargs (dict): passed to :func:`pd.concat() <pandas.concat>`

        **Example**::

            arr1 = np.random.rand(6,3)
            df2 = sc.dataframe(np.random.rand(4,3))
            df3 = df2.concat(arr1)

        | *New in version 2.0.2:* "inplace" defaults to False
        | *New in version 3.0.0:* improved type handling
        """
        dfargs = sc.mergedicts(dfargs)
        if columns is None:
            columns = self.columns

        # Convert each input to a dataframe, with the current dataframe first
        pieces = [self]
        pieces += [self._sanitize_df(item, columns=columns, **dfargs) for item in (data, *args)]

        combined = pd.concat(pieces, **kwargs)
        newdf = self._constructor(combined, **dfargs)
        return self.replacedata(newdf=newdf, reset_index=reset_index, inplace=inplace)




[docs]
    @classmethod
    def cat(cls, data, *args, dfargs=None, **kwargs):
        """
        Convenience class method for joining multiple dataframes together. See :meth:`df.concat() <dataframe.concat>`
        for the equivalent instance method.

        Args:
            data (dataframe/array): the dataframe/data to use as the basis of the new dataframe
            args (list): additional dataframes (or object that can be converted to dataframes) to concatenate
            dfargs (dict): arguments passed to construct each dataframe
            kwargs (dict): passed to :func:`df.concat() <dataframe.concat>`

        **Example**::

            arr1 = np.random.rand(6,3)
            df2 = pd.DataFrame(np.random.rand(4,3))
            df3 = sc.dataframe.cat(arr1, df2)

        *New in version 2.0.2.*
        """
        dfargs = sc.mergedicts(dfargs)
        base = cls(data, **dfargs)
        if not args: # Nothing further to concatenate
            return base
        return base.concat(*args, dfargs=dfargs, **kwargs)




[docs]
    def merge(self, *args, reset_index=True, inplace=False, **kwargs):
        """
        Alias to :func:`pd.merge <pandas.merge>`, except merge in place.

        Args:
            reset_index (bool): update the index
            inplace (bool): whether to append in place
            **kwargs (dict): passed to :func:`pd.concat() <pandas.concat>`

        *New in version 3.0.0.*

        **Example**::

            df = sc.dataframe(dict(x=[1,2,3], y=[4,5,6]))
            df2 = sc.dataframe(dict(x=[1,2,3], z=[9,8,7]))
            df.merge(df2, on='x', inplace=True)
        """
        newdf = self._constructor(pd.merge(self, *args, **kwargs))
        return self.replacedata(newdf=newdf, reset_index=reset_index, inplace=inplace)



    @property
    def ncols(self):
        """ Get the number of columns in the dataframe """
        return len(self.columns)


    @property
    def nrows(self):
        """ Get the number of rows in the dataframe """
        return len(self)



[docs]
    def addcol(self, key=None, value=None, data=None, inplace=True, **kwargs):
        """
        Add new column(s) to the data frame

        See also :meth:`assign() <pandas.DataFrame.assign>`, which is similar, but
        returns a new dataframe by default.

        Args:
            key (str): the name of the column
            value (array): the values for the column
            data (dict): alternatively, specify a dictionary of columns to add
            inplace (bool): whether to return a new dataframe
            kwargs (dict): additional columns to add

        NB: a single argument is interpreted as "data"

        **Example**::

            df = sc.dataframe(dict(x=[1,2,3], y=[4,5,6]))
            new_cols = dict(z=[1,2,3], a=[9,8,7])
            df.addcol(new_cols)
        """
        # Parse into a data dict
        if isinstance(key, dict):
            data = key
            if value is not None:
                errormsg = 'If appending columns via dict, value cannot be specified'
                raise ValueError(errormsg)
        elif key is not None:
            data = {key:value}
        else:
            data = {}
        data.update(kwargs)

        # Do the update
        if inplace:
            for k,v in data.items():
                self.__setitem__(k, v)
            return
        else:
            return self.assign(**data)




[docs]
    def popcols(self, col=None, *args, die=True):
        """
        Remove one or more columns from the data frame, in-place.

        Alias to :meth:`pop() <pandas.DataFrame.pop>`, except allowing multiple
        columns to be popped.

        Args:
            col (str/list): the column(s) to be popped
            args (list): additional columns to pop
            die (bool): whether to raise an exception if a column is not found (otherwise, print the message and continue)

        Returns:
            The dataframe itself, with the columns removed

        **Example**::

            df = sc.dataframe(cols=['a','b','c','d'], data=np.random.rand(3,4))
            df.popcols('a','c')
        """
        for name in sc.mergelists(col, list(args), keepnone=True):
            if name in self.columns:
                self.pop(name)
                continue
            # Column not found # pragma: no cover
            errormsg = f'sc.dataframe(): cannot remove column {name}: columns are:\n{sc.newlinejoin(self.cols)}'
            if die:
                raise Exception(errormsg)
            print(errormsg)
        return self




[docs]
    def findind(self, value=None, col=None, closest=False, die=True):
        """
        Find the row index for a given value and column.

        See :meth:`df.findrow() <dataframe.findrow>` for the equivalent to return the row itself
        rather than the index of the row. See :meth:`df.col_index() <dataframe.col_index>` for the column
        equivalent.

        Args:
            value (any): the value to look for (default: return last row index)
            col (str): the column to look in (default: first)
            closest (bool): if true, return the closest match (smallest absolute difference) instead of requiring an exact match
            die (bool): whether to raise an exception if the value is not found (otherwise, return None)

        Returns:
            The position of the first matching row, or None if there is no match and ``die`` is False

        Raises:
            IndexError: if no exact match is found and ``die`` is True

        **Example**::

            df = sc.dataframe(data=[[2016,0.3],[2017,0.5]], columns=['year','val'])
            df.findind(2016) # returns 0
            df.findind(0.5, 'val') # returns 1
            df.findind(2013) # raises an exception, or returns None if die is False
            df.findind(2013, closest=True) # returns 0

        *New in version 3.0.0:* renamed from "_rowindex"
        """
        col = self.col_index(col)
        coldata = self.iloc[:,col].values # Get data for this column
        if value is None: # pragma: no cover
            return len(coldata)-1 # If not supplied, pick the last element
        if closest: # pragma: no cover
            return np.argmin(abs(coldata-value)) # Find the closest match to the key

        try:
            return coldata.tolist().index(value) # Position of the first exact match
        except ValueError as E: # list.index() raises ValueError when the value is absent; anything else is a real error # pragma: no cover
            if die:
                errormsg = f'Item {value} not found; choices are: {coldata}'
                raise IndexError(errormsg) from E
            return None



    def _diffinds(self, inds=None):
        """
        Return the row positions that are *not* selected by ``inds``.

        Args:
            inds (list/array): positions to exclude, used to index an array of all row positions (default: exclude nothing)

        Returns:
            Sorted array of the remaining row positions
        """
        everything = np.arange(self.nrows)
        excluded = everything[[] if inds is None else inds] # Indexing resolves e.g. negative positions
        return np.setdiff1d(everything, excluded)



[docs]
    def poprow(self, row=-1, returnval=True):
        """
        Remove a row from the data frame.

        Alias to :meth:`drop <pandas.DataFrame.drop>`, except drop by position
        rather than label, and modify in-place. To pop multiple rows, see
        :meth:`df.poprows() <dataframe.poprows>`.

        Args:
            row (int/label): if an integer (Python or NumPy), the position of the row to pop; otherwise, an index label (or list of labels)
            returnval (bool): whether to return the row that was popped (otherwise, return the dataframe)

        Returns:
            The popped row if ``returnval`` is True, else the dataframe itself

        To pop a column, see :meth:`df.pop() <pandas.DataFrame.pop>`.

        NB: the row is ultimately dropped via its index label, so if the index
        contains duplicate labels, every row sharing that label is removed.

        *New in version 3.0.0:* "key" argument renamed "row"
        """
        if isinstance(row, numbers.Integral): # Positional; numbers.Integral also covers NumPy integers
            rowindex = row
            indexkey = self.index[row]
        elif pd.api.types.is_list_like(row): # Several labels: get_indexer() needs a collection
            rowindex = self.index.get_indexer(row)
            indexkey = row
        else: # A single label (most likely a string): get_indexer() rejects scalars, so use get_loc()
            rowindex = self.index.get_loc(row)
            indexkey = row

        thisrow = self.iloc[rowindex,:] if returnval else None # Grab the row before it is dropped
        self.drop(indexkey, inplace=True)
        return thisrow if returnval else self




[docs]
    def poprows(self, inds=-1, value=None, col=None, reset_index=True, inplace=True, **kwargs):
        """
        Remove multiple rows, chosen either by position or by value.

        To pop a single row, see :meth:`df.poprow() <dataframe.poprow>`.

        Args:
            inds (list): the positions of the rows to remove (ignored if ``value`` is given)
            value (any): alternatively, remove the rows where this value is found; see :meth:`df.findinds <dataframe.findinds>` for details
            col (str): if removing by value, use this column to find the value
            reset_index (bool): update the index
            inplace (bool): whether to modify in-place
            kwargs (dict): passed to :meth:`df.findinds <dataframe.findinds>`

        **Examples**::

            df = sc.dataframe(np.random.rand(10,3))
            df.poprows([3,4,5])

            df = sc.dataframe(dict(x=[0,1,2,3,4], y=[2,3,2,7,8]))
            df.poprows(value=2, col='y')
        """
        # A value search, if requested, takes precedence over explicit positions
        doomed = inds if value is None else self.findinds(value=value, col=col, **kwargs)

        # Rebuild the dataframe from the rows that survive
        survivors = self.iloc[self._diffinds(doomed),:]
        trimmed = self._constructor(data=survivors, cols=self.cols)
        return self.replacedata(newdf=trimmed, reset_index=reset_index, inplace=inplace)




[docs]
    def enumrows(self, cols=None, type='objdict'):
        """
        Efficiently enumerate the rows of the dataframe

        Similar to :meth:`df.iterrows() <pandas.DataFrame.iterrows>`, but up to
        30x faster since uses tuples instead of ``pd.Series``.

        Args:
            cols (list): the list of columns to include in the enumeration (by default, all)
            type (str/type): the output type for each row: options are 'objdict' (default), tuple (fastest), list (very fast), dict (pretty fast)

        Yields:
            Tuples of (row number, row), with the row converted to the requested type

        Raises:
            ValueError: if ``type`` is not one of the supported options. Since this
                is a generator, the error is raised when iteration starts.

        **Examples**::

            df = sc.dataframe(dict(x=[0,1,2,3,4], y=[2,3,2,7,8], z=[5,5,4,3,2]))
            for i,row in df.enumrows(): print(i, row.x+row.y) # Typical use case
            for i,row in df.enumrows(type=tuple): print(i, row[0]+row[1]) # Fastest
            for i,row in df.enumrows(type=dict): print(i, row['x']+row['y']) # Still fast
            for i,(x,y) in df.enumrows(cols=['x', 'y'], type=tuple): print(i, x+y) # Even faster
        """
        # Handle the columns
        if cols is None: cols = self.columns
        dftuple = (self[col] for col in cols)

        # Handle the output type, accepting either the name or the constructor itself
        type_map = {
            'tuple': tuple,
            'list': list,
            'dict': dict,
            'objdict': sc.objdict,
        }
        if isinstance(type, str):
            kind = type if type in type_map else None
        else: # Match by identity so unsupported (or unhashable) callables are rejected cleanly
            kind = next((name for name,func in type_map.items() if func is type), None)
        if kind is None:
            errormsg = f'Invalid input {type}: must be tuple, list, dict, or objdict'
            raise ValueError(errormsg)
        func = type_map[kind]
        dictlike = kind in ['dict', 'objdict']

        # Iterate -- the branch is outside the loops to keep the per-row work minimal
        if dictlike:
            for i,row in enumerate(zip(*dftuple)):
                yield (i, func(dict(zip(cols, row))))
        else:
            for i,row in enumerate(zip(*dftuple)):
                yield (i, func(row))




[docs]
    def replacecol(self, col=None, old=None, new=None):
        """
        Replace all occurrences of one value in a column with a new value, in place.

        Args:
            col (str/int): the column to operate on, resolved via :meth:`df.col_index() <dataframe.col_index>`
            old (any): the value to search for
            new (any): the value to replace it with

        Returns:
            The dataframe itself
        """
        colind = self.col_index(col)
        matches = sc.findinds(arr=self.iloc[:,colind], val=old) # Rows of this column holding the old value
        self.iloc[matches,colind] = new
        return self




[docs]
    def to_odict(self, row=None):
        """
        Convert dataframe to a dict of columns, optionally specifying certain rows.

        Args:
            row (int/list): the positions of the rows to include (default: all rows)

        Returns:
            An :class:`sc.odict` mapping each column name to an array of that
            column's values for the selected rows. A single integer row gives
            arrays of length 1.
        """
        if row is None:
            row = slice(None)
        elif isinstance(row, numbers.Integral):
            row = [row] # Keep the selection 2D; a bare integer would give 1D data that cannot be sliced by column
        data = self.iloc[row,:].values
        datadict = {col:data[:,c] for c,col in enumerate(self.cols)}
        return sc.odict(datadict)




[docs]
    def findrow(self, value=None, col=None, default=None, closest=False, asdict=False, die=False):
        """
        Return a row by searching for a matching value.

        See :meth:`df.findind() <dataframe.findind>` for the equivalent to return the index of the row
        rather than the row itself, and :meth:`df.findinds() <dataframe.findinds>`
        to find multiple row indices.

        Args:
            value (any): the value to look for
            col (str): the column to look for this value in
            default (any): the value to return if key is not found (overrides die)
            closest (bool): whether or not to return the closest row (overrides default and die)
            asdict (bool): whether to return results as dict (column name -> value) rather than an array
            die (bool): whether to raise an exception if the value is not found

        Returns:
            The values of the matching row (as an array, or as an :class:`sc.odict`
            if ``asdict`` is True), or ``default`` if no row was found

        **Examples**::

            df = sc.dataframe(cols=['year','val'],data=[[2016,0.3],[2017,0.5], [2018, 0.3]])
            df.findrow(2016) # returns array([2016, 0.3], dtype=object)
            df.findrow(2013) # returns None, or exception if die is True
            df.findrow(2013, closest=True) # returns array([2016, 0.3], dtype=object)
            df.findrow(2016, asdict=True) # returns {'year':2016, 'val':0.3}
        """
        index = self.findind(value=value, col=col, die=(die and default is None), closest=closest)
        if index is None:
            return default # If not found, return as default

        thisrow = self.iloc[index,:].values
        if asdict: # Pair each column name with this row's value (to_odict() expects row positions, not values)
            thisrow = sc.odict(dict(zip(self.cols, thisrow)))
        return thisrow




[docs]
    def findinds(self, value=None, col=None, **kwargs):
        """
        Find every row whose entry in the given column matches the given value.

        Args:
            value (any): the value to look for
            col (str): the column to look in
            kwargs (dict): passed to :func:`sc.findinds() <sciris.sc_math.findinds>`

        Returns:
            The output of :func:`sc.findinds() <sciris.sc_math.findinds>` applied to the column's values

        **Example**::

            df = sc.dataframe(cols=['year','val'],data=[[2016,0.3],[2017,0.5], [2018, 0.3]])
            df.findinds(0.3, 'val') # Returns array([0,2])
        """
        colind = self.col_index(col)
        return sc.findinds(arr=self.iloc[:,colind].values, val=value, **kwargs) # Search the raw column values



    def _filterrows(self, inds=None, value=None, col=None, keep=True, verbose=False, reset_index=True, inplace=False):
        """
        Shared implementation for filtering rows in or out.

        The rows are selected by position (``inds``) or by searching ``col`` for
        ``value``; they are then either the only ones kept (``keep=True``) or
        the ones removed (``keep=False``). The removal is done by
        :meth:`df.poprows() <dataframe.poprows>`, whose output is returned.
        """
        inds = inds if value is None else self.findinds(value=value, col=col) # A value search takes precedence over explicit positions
        inds = self._diffinds(inds) if keep else inds # Rows to *remove*: to keep the selection, drop everything else
        if verbose:
            print(f'Dataframe filtering: {len(inds)} rows removed based on key="{inds}", column="{col}"')
        return self.poprows(inds=inds, reset_index=reset_index, inplace=inplace)



[docs]
    def filterin(self, inds=None, value=None, col=None, verbose=False, reset_index=True, inplace=False):
        """
        Keep only the rows matching a criterion, discarding all others; see also
        :meth:`df.filterout() <dataframe.filterout>` for the opposite.

        Args:
            inds (list): the positions of the rows to keep
            value (any): alternatively, keep the rows where this value is found
            col (str): if filtering by value, the column to search
            verbose (bool): whether to print a summary of the filtering
            reset_index (bool): update the index
            inplace (bool): whether to modify in-place (default false)
        """
        options = dict(inds=inds, value=value, col=col, verbose=verbose, reset_index=reset_index, inplace=inplace)
        return self._filterrows(keep=True, **options)




[docs]
    def filterout(self, inds=None, value=None, col=None, verbose=False, reset_index=True, inplace=False):
        """
        Remove the rows matching a criterion, keeping all others; see also
        :meth:`df.filterin() <dataframe.filterin>` for the opposite.

        Args:
            inds (list): the positions of the rows to remove
            value (any): alternatively, remove the rows where this value is found
            col (str): if filtering by value, the column to search
            verbose (bool): whether to print a summary of the filtering
            reset_index (bool): update the index
            inplace (bool): whether to modify in-place (default false)
        """
        options = dict(inds=inds, value=value, col=col, verbose=verbose, reset_index=reset_index, inplace=inplace)
        return self._filterrows(keep=False, **options)




[docs]
    def filtercols(self, cols=None, *args, keep=True, die=True, reset_index=True, inplace=False):
        """
        Filter columns keeping only those specified -- note, by default, do not perform in place

        Args:
            cols (str/list): the columns to keep (or remove if keep=False)
            args (list): additional columns
            keep (bool): whether to keep the named columns (else, remove them)
            die (bool): whether to raise an exception if a column is not found (otherwise, print the message and ignore that column)
            reset_index (bool): update the index
            inplace (bool): whether to modify in-place

        **Examples**::

            df = sc.dataframe(cols=['a','b','c','d'], data=np.random.rand(3,4))
            df2 = df.filtercols('a','b') # Keeps columns 'a' and 'b'
            df3 = df.filtercols('a','c', keep=False) # Keeps columns 'b' and 'd'
        """
        requested = sc.mergelists(cols, list(args), keepnone=True)
        allcols = self.cols

        # Sort the requested columns into found and not found, building new lists
        # rather than removing from the list being iterated (which skips entries)
        cols = []
        order = []
        notfound = []
        for col in requested:
            try:
                ind = allcols.index(col)
            except ValueError: # pragma: no cover
                notfound.append(col)
            else:
                order.append(ind)
                cols.append(col)

        if len(notfound): # pragma: no cover
            errormsg = 'sc.dataframe(): could not find the following column(s): %s\nChoices are: %s' % (notfound, allcols)
            if die: raise Exception(errormsg)
            else:   print(errormsg)

        if not keep: # Invert the selection, keeping the remaining columns in their original order # pragma: no cover
            order = np.setdiff1d(np.arange(len(allcols)), order)
            cols = [allcols[o] for o in order]

        ordered_data = self.iloc[:,order] # Resort and filter the data
        newdf = self._constructor(cols=cols, data=ordered_data)
        return self.replacedata(newdf=newdf, reset_index=reset_index, inplace=inplace)




[docs]
    def sortrows(self, by=None, reverse=False, returninds=False, reset_index=True, inplace=True, **kwargs):
        """
        Sort the dataframe rows in place by the specified column(s).

        Similar to :meth:`df.sort_values() <pandas.DataFrame.sort_values>`, except defaults to sorting in place, and
        optionally returns the indices used for sorting (like :func:`np.argsort() <numpy.argsort>`).

        Args:
            by (str or int): column to sort by (default, first column); an integer is interpreted as a column position
            reverse (bool): whether to reverse the sort order (i.e., ascending=False)
            returninds (bool): whether to return the indices used to sort instead of the dataframe
            reset_index (bool): update the index
            inplace (bool): whether to modify the dataframe in-place
            kwargs (dict): passed to :meth:`df.sort_values() <pandas.DataFrame.sort_values>`

        Returns:
            If ``returninds``, an array of the original row positions in their
            sorted order; otherwise, the sorted dataframe.

        *New in version 3.0.0:* "inplace" argument; "col" argument renamed "by"
        """
        by = kwargs.pop('col', by) # Handle deprecation
        ascending = kwargs.pop('ascending', not(reverse))
        if by is None:
            by = 0 # Sort by first column by default
        if isinstance(by, numbers.Integral): # Column position (including NumPy integers) -> column label
            by = self.columns[by]

        if returninds:
            # Sort a positionally-indexed copy with the same settings as the real
            # sort, so the indices honor the sort direction, multiple columns, and
            # any sort_values() options. Must be done before an in-place sort.
            order_kw = {k:v for k,v in kwargs.items() if k != 'ignore_index'} # ignore_index would discard the positions
            positional = pd.DataFrame(self).reset_index(drop=True)
            sortorder = positional.sort_values(by=by, ascending=ascending, **order_kw).index.values

        df = self.sort_values(by=by, ascending=ascending, inplace=inplace, **kwargs)
        out = self if inplace else df # sort_values() returns None when sorting in place
        if reset_index:
            out.reset_index(drop=True, inplace=True)
        return sortorder if returninds else out




[docs]
    def sort(self, by=None, reverse=False, returninds=False, inplace=True, **kwargs):
        """
        Alias to :meth:`sortrows() <dataframe.sortrows>`.

        Args:
            by (str or int): column to sort by
            reverse (bool): whether to reverse the sort order
            returninds (bool): whether to return the indices used to sort instead of the dataframe
            inplace (bool): whether to modify the dataframe in-place
            kwargs (dict): passed to :meth:`sortrows() <dataframe.sortrows>`

        *New in version 3.0.0.*
        """
        return self.sortrows(by=by, reverse=reverse, returninds=returninds, inplace=inplace, **kwargs)




[docs]
    def sortcols(self, sortorder=None, reverse=False, inplace=True):
        """
        Like sortrows(), but change column order (usually in place) instead.

        Args:
            sortorder (list): the list of indices to resort the columns by (if none, then alphabetical)
            reverse (bool): whether to reverse the order (applied to either the alphabetical or the supplied order)
            inplace (bool): whether to modify the dataframe in-place

        *New in version 3.0.0:* Ensure dtypes are preserved; "inplace" argument; "returninds" argument removed
        """
        cols = list(self.cols)
        if sortorder is None:
            positions = np.argsort(cols, kind='mergesort') # Stable alphabetical order
        else:
            positions = np.arange(len(cols))[sortorder] # Resolve any NumPy-style index to plain positions
        if reverse:
            positions = positions[::-1]

        # Look the labels up in the original list, so they keep their original
        # types rather than being coerced by a round trip through a NumPy array
        newcols = [cols[i] for i in positions]
        newdf = self._constructor({k:self[k] for k in newcols}) # Use the constructor so subclasses are preserved
        return self.replacedata(newdf=newdf, inplace=inplace)




[docs]
    def to_pandas(self, **kwargs):
        """
        Convert to a plain pandas dataframe

        Args:
            kwargs (dict): passed to :class:`pd.DataFrame() <pandas.DataFrame>` (e.g. ``dtype``, ``copy``)
        """
        return pd.DataFrame(self, **kwargs)



[docs]
    @classmethod
    def read_csv(cls, *args, **kwargs):
        """ Alias to :func:`pd.read_csv <pandas.read_csv>`, returning a Sciris dataframe """
        plain = pd.read_csv(*args, **kwargs) # Let pandas do the parsing
        return cls(plain) # Then wrap the result in this class



[docs]
    @classmethod
    def read_excel(cls, *args, **kwargs):
        """
        Alias to :func:`pd.read_excel <pandas.read_excel>`, returning a Sciris dataframe

        If pandas returns a dict of dataframes rather than a single dataframe,
        an :class:`sc.objdict` of Sciris dataframes is returned instead. Any other
        output from pandas is returned unchanged.
        """
        result = pd.read_excel(*args, **kwargs)
        if isinstance(result, pd.DataFrame): # Single sheet: just wrap it
            return cls(result)
        if isinstance(result, dict): # Dict of dataframes: wrap each entry
            result = sc.objdict(result)
            for sheet,frame in result.items():
                result[sheet] = cls(frame)
        return result


    @property
    def _constructor(self):
        """
        Overload pandas method to ensure correct type; replaces :class:`pd.DataFrame() <pandas.DataFrame>`

        Returns the class of this instance (rather than a fixed class), so that
        subclasses construct instances of themselves.
        """
        own_class = self.__class__ # Looked up on the instance to allow subclassing
        return own_class