Source code for bpd

from collections.abc import Iterable

import numpy as np
import pandas as pd

from pandas.core import common as com
from pandas.core import indexing

# Module-level side effect on import: set pandas' "display.max_rows" option
# to 10 for the whole process.
pd.set_option("display.max_rows", 10)


class DataFrame(object):
    '''
    Custom DataFrame class; a pandas DataFrame with most methods removed.

    The wrapped ``pandas.DataFrame`` is stored on the ``_pd`` attribute and
    is returned unchanged by ``to_df()``.

    Examples
    --------
    >>> df = DataFrame.from_records([[1,2,3],[4,5,6]], columns=['a', 'b', 'c'])
    >>> df.shape
    (2, 3)
    >>> df.assign(d=[1,2]).shape
    (2, 4)
    >>> df.loc[1, 'b']
    5
    '''

def __init__(self, **kwargs):
    '''
    Create a DataFrame.

    All keyword arguments are forwarded to ``pd.DataFrame``; with no
    arguments the result is an empty DataFrame.
    '''
    # The wrapped pandas object; every method delegates to it.
    frame = pd.DataFrame(**kwargs)
    self._pd = frame
    # Wrap the pandas indexers so their results come back as custom objects.
    self.loc = DataFrameIndexer(frame.loc)
    self.iloc = DataFrameIndexer(frame.iloc)
@property
def T(self):
    '''Transpose, wrapped in a new instance of the caller's class.'''
    transposed = self._pd.T
    return type(self)(data=transposed)


@property
def index(self):
    '''Index (row labels) of the wrapped pandas DataFrame.'''
    return self._pd.index


@property
def columns(self):
    '''Column labels of the wrapped pandas DataFrame.'''
    return self._pd.columns


@property
def values(self):
    '''``values`` attribute of the wrapped pandas DataFrame.'''
    return self._pd.values


@property
def shape(self):
    '''``(rows, columns)`` tuple of the wrapped pandas DataFrame.'''
    return self._pd.shape


# Formatting
def __repr__(self):
    return repr(self._pd)


def __str__(self):
    return str(self._pd)


# return the underlying DataFrame
def to_df(self):
    '''Return the full pandas DataFrame.'''
    return self._pd

# Creation
@classmethod
def from_dict(cls, data):
    """
    Construct DataFrame from dict of array-like or dicts.

    Parameters
    ----------
    data : dict
        Of the form {field : array-like} or {field : dict}.

    Returns
    -------
    DataFrame
        A new instance built by passing `data` as the ``data`` keyword.
    """
    built = cls(data=data)
    return built
@classmethod
def from_records(cls, data, *, columns=None):
    """
    Convert structured or record ndarray to DataFrame.

    Parameters
    ----------
    data : ndarray (structured dtype), list of tuples, dict, or DataFrame
    columns : sequence, default None, keyword-only
        Column names to use. If the passed data do not have names
        associated with them, this argument provides names for the
        columns. Otherwise this argument indicates the order of the
        columns in the result (any names not found in the data will
        become all-NA columns).

    Returns
    -------
    DataFrame
    """
    constructor_args = {'data': data, 'columns': columns}
    return cls(**constructor_args)
# Dunder Attributes
def _repr_html_(self):
    '''HTML representation, delegated to the wrapped pandas DataFrame.'''
    return _lift_to_pd(self._pd._repr_html_)()


def __getitem__(self, key):
    '''
    Select rows with a Boolean indexer.

    Objects exposing ``to_ser`` are first converted with it. Any key that
    is not a Boolean indexer raises IndexError.
    '''
    if getattr(key, 'to_ser', None):
        # Convert to pd.Series
        key = key.to_ser()

    if com.is_bool_indexer(key):
        select_rows = _lift_to_pd(self._pd._getitem_bool_array)
        return select_rows(key)

    raise IndexError('BabyPandas only accepts Boolean objects '
                     'when indexing against the data frame; '
                     'please use .get to get columns, and '
                     '.loc or .iloc for more complex cases.')

# Selection
def take(self, indices):
    """
    Return the rows in the given *positional* indices.

    This means that we are not indexing according to actual values in the
    index attribute of the object. We are indexing according to the actual
    position of the element in the object. Negative positions count from
    the end.

    Parameters
    ----------
    indices : array-like
        An array of ints indicating which positions to take.

    Returns
    -------
    taken : DataFrame
        A DataFrame containing the elements taken from the object.

    Raises
    ------
    TypeError
        If `indices` is not iterable.
    ValueError
        If `indices` contains anything other than integers.
    IndexError
        If any `indices` are out of bounds with respect to DataFrame length.

    Examples
    --------
    >>> df = bpd.DataFrame().assign(name=['falcon', 'parrot', 'lion'],
    ...                             kind=['bird', 'bird', 'mammal'])
    >>> df
         name    kind
    0  falcon    bird
    1  parrot    bird
    2    lion  mammal
    >>> df.take([0, 2])
         name    kind
    0  falcon    bird
    2    lion  mammal
    """
    if not isinstance(indices, Iterable):
        raise TypeError('Argument `indices` must be a list-like object')
    # Materialize once: the checks below iterate the positions, which would
    # otherwise exhaust a one-shot iterable before it reaches pandas.
    indices = list(indices)
    if not all(isinstance(x, (int, np.integer)) for x in indices):
        raise ValueError('Argument `indices` must only contain integers')
    n_rows = self._pd.shape[0]
    # Negative positions are valid down to -n_rows; check both ends.
    if not all(-n_rows <= x < n_rows for x in indices):
        raise IndexError('Indices are out-of-bounds')
    f = _lift_to_pd(self._pd.take)
    return f(indices=indices)
def drop(self, *, columns=None):
    """
    Remove columns by specifying column names.

    Parameters
    ----------
    columns : single label or list-like, keyword only
        Column names to drop.

    Returns
    -------
    df : DataFrame
        DataFrame with the dropped columns.

    Raises
    ------
    TypeError
        If `columns` is not a string or an iterable.
    KeyError
        If any of the column labels is not found.

    Examples
    --------
    >>> df = bpd.DataFrame().assign(A=[0, 4, 8],
    ...                             B=[1, 5, 9],
    ...                             C=[2, 6, 10],
    ...                             D=[3, 7, 11])
    >>> df
       A  B   C   D
    0  0  1   2   3
    1  4  5   6   7
    2  8  9  10  11
    >>> df.drop(columns=['B', 'C'])
       A   D
    0  0   3
    1  4   7
    2  8  11
    """
    if not isinstance(columns, Iterable):
        raise TypeError('Argument `columns` must be a string label or list of string labels')

    # Treat a lone string as a one-element list of labels.
    labels = [columns] if isinstance(columns, str) else columns
    missing = [label not in self.columns for label in labels]
    if any(missing):
        raise KeyError('{} not found in columns'.format(np.array(labels)[missing]))

    dropper = _lift_to_pd(self._pd.drop)
    return dropper(columns=columns)
def sample(self, n=None, *, replace=False, random_state=None):
    '''
    Return a random sample of rows from a data frame.

    You can use `random_state` for reproducibility.

    Parameters
    ----------
    n : None or int, optional
        Number of rows to return. None corresponds to 1.
    replace : {False, True}, optional, keyword only.
        Sample with or without replacement.
    random_state : int or numpy.random.RandomState, optional, keyword only
        Seed for the random number generator (if int), or numpy RandomState
        object.

    Returns
    -------
    s_df : DataFrame
        A new DataFrame containing `n` items randomly sampled from the
        caller object.

    Raises
    ------
    TypeError
        If `n`, `replace` or `random_state` has an unsupported type.
    ValueError
        If a sample larger than the length of the DataFrame is taken
        without replacement.

    Examples
    --------
    >>> df = bpd.DataFrame().assign(letter=['a', 'b', 'c'],
    ...                             count=[9, 3, 3],
    ...                             points=[1, 2, 2])
    >>> df.sample(1, random_state=0)
      letter  count  points
    2      c      3       2
    '''
    # numpy integers (e.g. the result of len-like numpy computations) are
    # accepted alongside Python ints.
    if n is not None and not isinstance(n, (int, np.integer)):
        raise TypeError('Argument `n` not an integer')
    if not isinstance(replace, bool):
        raise TypeError('Argument `replace` not a boolean')
    # The documented contract allows a RandomState instance as well as a seed.
    seed_types = (int, np.integer, np.random.RandomState)
    if random_state is not None and not isinstance(random_state, seed_types):
        raise TypeError('Argument `random_state` must be an integer or None')
    if n is not None and n > self._pd.shape[0] and not replace:
        raise ValueError('Cannot take a larger sample than length of DataFrame when `replace=False`')
    f = _lift_to_pd(self._pd.sample)
    return f(n=n, replace=replace, random_state=random_state)
def get(self, key):
    '''
    Return column or columns from data frame.

    Parameters
    ----------
    key : str or iterable of strings
        Column label or iterable of column labels

    Returns
    -------
    series_or_df : Series or DataFrame
        Series with the corresponding label or DataFrame with the
        corresponding column labels.

    Raises
    ------
    TypeError
        If `key` is neither a string nor an iterable.
    KeyError
        If any column named in `key` not found in columns.

    Examples
    --------
    >>> df = bpd.DataFrame().assign(letter=['a', 'b', 'c'],
    ...                             count=[9, 3, 3],
    ...                             points=[1, 2, 2])
    >>> df.get('letter')
    0    a
    1    b
    2    c
    Name: letter, dtype: object
    >>> df.get(['count', 'points'])
       count  points
    0      9       1
    1      3       2
    2      3       2
    '''
    if not isinstance(key, (str, Iterable)):
        raise TypeError('Argument `key` must be a string label or list of string labels')

    # A single label is checked as a one-element list.
    wanted = [key] if isinstance(key, str) else key
    absent = [label not in self.columns for label in wanted]
    if any(absent):
        raise KeyError('{} not found in columns'.format(np.array(wanted)[absent]))

    getter = _lift_to_pd(self._pd.get)
    return getter(key=key)
# Creation
def assign(self, **kwargs):
    '''
    Assign new columns to a DataFrame.

    Returns a new object with all original columns in addition to new ones.
    Existing columns that are re-assigned will be overwritten.

    Parameters
    ----------
    **kwargs : dict of {str: callable or Series}
        The column names are keywords. If the values are callable, they
        are computed on the DataFrame and assigned to the new columns. The
        callable must not change input DataFrame (though pandas doesn't
        check it). If the values are not callable, (e.g. a Series, scalar,
        or array), they are simply assigned.

    Returns
    -------
    df_with_cols : DataFrame
        A new DataFrame with the new columns in addition to all the
        existing columns.

    Raises
    ------
    ValueError
        If columns have different lengths or if new columns have different
        lengths than the existing DataFrame

    Examples
    --------
    >>> df = bpd.DataFrame().assign(flower=['sunflower', 'rose'])
    >>> df.assign(color=['yellow', 'red'])
          flower   color
    0  sunflower  yellow
    1       rose     red
    '''
    # Only array-like values take part in the length checks. Callables are
    # evaluated by pandas, and scalars (including strings) are broadcast,
    # so neither has a meaningful length here.
    lengths = []
    for value in kwargs.values():
        if callable(value) or isinstance(value, (str, bytes)):
            continue
        try:
            lengths.append(len(value))
        except TypeError:
            continue

    if len(set(lengths)) > 1:
        raise ValueError('Not all columns have the same length')
    # `lengths` may be empty (no kwargs, or only scalars/callables); in that
    # case there is nothing to compare against the existing frame.
    if lengths and self._pd.shape[1] != 0 and lengths[0] != self._pd.shape[0]:
        raise ValueError('New column does not have the same length as existing DataFrame')
    f = _lift_to_pd(self._pd.assign)
    return f(**kwargs)
# Transformation
def apply(self, func, axis=0):
    """
    Apply a function along an axis of the DataFrame.

    Objects passed to the function are Series objects whose index is
    either the DataFrame's index (``axis=0``) or the DataFrame's columns
    (``axis=1``). The final return type is inferred from the return type
    of the applied function.

    Parameters
    ----------
    func : function
        Function to apply to each column or row.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.

    Returns
    -------
    applied : Series or DataFrame
        Result of applying ``func`` along the given axis of the DataFrame.

    Raises
    ------
    TypeError
        If `func` is not callable.
    ValueError
        If `axis` is not one of the accepted values.

    Examples
    --------
    >>> def add_two(row):
    ...     return row + 2
    >>> df = bpd.DataFrame().assign(A=[1, 1],
    ...                             B=[2, 2])
    >>> df.apply(add_two)
       A  B
    0  3  4
    1  3  4
    """
    if not callable(func):
        raise TypeError('Argument `func` must be a function')

    allowed_axes = (0, 1, 'index', 'columns')
    if axis not in allowed_axes:
        raise ValueError('Argument `axis` must be one of 0, 1 '
                         '"index" or "columns"')

    applier = _lift_to_pd(self._pd.apply)
    return applier(func=func, axis=axis)
def sort_values(self, by, *, ascending=True):
    '''
    Sort by the values in column(s) named in `by`.

    Parameters
    ----------
    by : str or list of str
        Name or list of column names to sort by.
    ascending : {True, False} or list of bool, keyword only
        Sort ascending vs. descending. Specify list for multiple sort
        orders. If this is a list of bools, must match the length of
        the `by`. Default is True.

    Returns
    -------
    sorted_obj : DataFrame

    Raises
    ------
    TypeError
        If `by` is not a string or iterable, or `ascending` is neither a
        boolean nor a list of booleans.
    KeyError
        If `by` not found in columns.
    ValueError
        If `ascending` is a list whose length differs from `by`.

    Examples
    --------
    >>> df = bpd.DataFrame().assign(name=['Sally', 'George', 'Bill', 'Ann'],
    ...                             age=[21, 25, 18, 28],
    ...                             height_cm=[161, 168, 171, 149])
    >>> df.sort_values(by='age')
         name  age  height_cm
    2    Bill   18        171
    0   Sally   21        161
    1  George   25        168
    3     Ann   28        149
    >>> df.sort_values(by='height_cm', ascending=False)
         name  age  height_cm
    2    Bill   18        171
    1  George   25        168
    0   Sally   21        161
    3     Ann   28        149
    '''
    if not isinstance(by, Iterable):
        raise TypeError('Argument `by` must be a string label or list of string labels')
    labels = [by] if isinstance(by, str) else by
    mask = [x not in self.columns for x in labels]
    if any(mask):
        raise KeyError('{} not found in columns'.format(np.array(labels)[mask]))

    if not isinstance(ascending, bool):
        # The documented contract also allows one boolean per sort key.
        is_bool_list = (isinstance(ascending, (list, tuple))
                        and all(isinstance(flag, bool) for flag in ascending))
        if not is_bool_list:
            raise TypeError('Argument `ascending` must be a boolean or list of booleans')
        # `mask` has exactly one entry per label in `by`.
        if len(ascending) != len(mask):
            raise ValueError('Length of `ascending` must match the length of `by`')
        ascending = list(ascending)

    f = _lift_to_pd(self._pd.sort_values)
    return f(by=by, ascending=ascending)
def describe(self):
    '''
    Generate descriptive statistics.

    Statistics summarize the central tendency, dispersion and shape of a
    dataset's distribution, excluding ``NaN`` values. The computation is
    delegated to the wrapped pandas DataFrame.

    Parameters
    ----------
    None

    Returns
    -------
    descr : DataFrame
        Summary statistics of the Dataframe provided.

    Examples
    --------
    >>> df = bpd.DataFrame().assign(A=[0, 10, 20],
    ...                             B=[1, 2, 3])
    >>> df.describe()
              A    B
    count   3.0  3.0
    mean   10.0  2.0
    std    10.0  1.0
    min     0.0  1.0
    25%     5.0  1.5
    50%    10.0  2.0
    75%    15.0  2.5
    max    20.0  3.0
    '''
    return _lift_to_pd(self._pd.describe)()
def groupby(self, by):
    '''
    Group DataFrame by values in columns specified in `by`.

    A groupby operation involves some combination of splitting the object,
    applying a function, and combining the results. This can be used to
    group large amounts of data and compute operations on these groups.

    Parameters
    ----------
    by : label, or list of labels
        Used to determine the groups for the groupby. Should be a label or
        list of labels that will group by the named columns in ``self``.

    Returns
    -------
    df_gb : DataFrameGroupBy
        groupby object that contains information about the groups.

    Raises
    ------
    TypeError
        If `by` is not a string or iterable.
    KeyError
        If `by` not found in columns

    Examples
    --------
    >>> df = bpd.DataFrame().assign(animal=['Falcon', 'Falcon', 'Parrot', 'Parrot'],
    ...                             max_speed=[380, 370, 24, 26])
    >>> df.groupby('animal').mean()
            max_speed
    animal
    Falcon      375.0
    Parrot       25.0
    '''
    if not isinstance(by, Iterable):
        raise TypeError('Argument `by` must be a string label or list of string labels')

    # A single label is validated as a one-element list.
    group_keys = [by] if isinstance(by, str) else by
    unknown = [label not in self.columns for label in group_keys]
    if any(unknown):
        raise KeyError('{} not found in columns'.format(np.array(group_keys)[unknown]))

    grouper = _lift_to_pd(self._pd.groupby)
    return grouper(by=by)
def reset_index(self, *, drop=False):
    '''
    Reset the index of the DataFrame, and use the default one instead.

    Parameters
    ----------
    drop : bool, default False, keyword only
        Do not try to insert index into dataframe columns. This resets
        the index to the default integer index.

    Returns
    -------
    DataFrame
        DataFrame with the new index.

    Raises
    ------
    TypeError
        If `drop` is not a boolean.

    Examples
    --------
    >>> df = bpd.DataFrame().assign(name=['Sally', 'George', 'Bill', 'Ann'],
    ...                             age=[21, 25, 18, 28],
    ...                             height_cm=[161, 168, 171, 149])
    >>> sorted = df.sort_values(by='age')
    >>> sorted
         name  age  height_cm
    2    Bill   18        171
    0   Sally   21        161
    1  George   25        168
    3     Ann   28        149
    >>> sorted.reset_index(drop=True)
         name  age  height_cm
    0    Bill   18        171
    1   Sally   21        161
    2  George   25        168
    3     Ann   28        149
    '''
    if isinstance(drop, bool):
        resetter = _lift_to_pd(self._pd.reset_index)
        return resetter(drop=drop)
    raise TypeError('Argument `drop` must be a boolean')
def set_index(self, keys, drop=True):
    '''
    Set the DataFrame index (row labels) using one or more existing columns.

    The new index replaces the existing index.

    Parameters
    ----------
    keys : label or list of labels
        A single column label or a list of column labels; every label is
        checked against the DataFrame's columns.
    drop : bool, default True
        Delete columns to be used as the new index.

    Returns
    -------
    DataFrame
        Data frame with changed row labels.

    Raises
    ------
    TypeError
        If `keys` is not a string or iterable, or `drop` is not a boolean.
    KeyError
        If `keys` not found in columns.

    Examples
    --------
    >>> df = bpd.DataFrame().assign(name=['Sally', 'George', 'Bill', 'Ann'],
    ...                             age=[21, 25, 18, 28],
    ...                             height_cm=[161, 168, 171, 149])
    >>> df.set_index('name')
            age  height_cm
    name
    Sally    21        161
    George   25        168
    Bill     18        171
    Ann      28        149
    '''
    if not isinstance(keys, Iterable):
        raise TypeError('Argument `keys` must be a string label or list of string labels')

    # A single label is validated as a one-element list.
    index_labels = [keys] if isinstance(keys, str) else keys
    not_found = [label not in self.columns for label in index_labels]
    if any(not_found):
        raise KeyError('{} not found in columns'.format(np.array(index_labels)[not_found]))

    if not isinstance(drop, bool):
        raise TypeError('Argument `drop` must be a boolean')

    setter = _lift_to_pd(self._pd.set_index)
    return setter(keys=keys, drop=drop)
# Combining
def merge(
    self,
    right,
    how='inner',
    on=None,
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False
):
    '''
    Merge DataFrame objects with a database-style join.

    The join is done on columns or indexes. If joining columns on columns,
    the DataFrame indexes *will be ignored*. Otherwise if joining indexes
    on indexes or indexes on a column or columns, the index will be passed
    on.

    Parameters
    ----------
    right : DataFrame
        Object to merge with.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        Type of merge to be performed.

        * left: use only keys from left frame, similar to a SQL left
          outer join; preserve key order.
        * right: use only keys from right frame, similar to a SQL right
          outer join; preserve key order.
        * outer: use union of keys from both frames, similar to a SQL
          full outer join; sort keys lexicographically.
        * inner: use intersection of keys from both frames, similar to a
          SQL inner join; preserve the order of the left keys.
    on : label or list
        Column names to join on. These must be found in both DataFrames.
        If `on` is None and not merging on indexes then this defaults to
        the intersection of the columns in both DataFrames.
    left_on : label or list
        Column names to join on in the left DataFrame.
    right_on : label or list
        Column names to join on in the right DataFrame.
    left_index : boolean, default False
        Use the index from the left DataFrame as the join key(s).
    right_index : boolean, default False
        Use the index from the right DataFrame as the join key. Same
        caveats as left_index.

    Returns
    -------
    DataFrame
        A DataFrame of the two merged objects.

    Raises
    ------
    TypeError
        If `right` is not a DataFrame.
    ValueError
        If `how` is not one of the supported join types.
    KeyError
        If any input labels are not found in the corresponding
        DataFrame's columns.

    Examples
    --------
    >>> df1 = bpd.DataFrame().assign(pet=['dog', 'cat', 'lizard', 'turtle'],
    ...                              kind=['mammal', 'mammal', 'reptile', 'reptile'])
    >>> df2 = bpd.DataFrame().assign(kind=['mammal', 'reptile', 'amphibian'],
    ...                              abr=['m', 'r', 'a'])
    >>> df1.merge(df2, on='kind')
          pet     kind abr
    0     dog   mammal   m
    1     cat   mammal   m
    2  lizard  reptile   r
    3  turtle  reptile   r
    '''
    def as_labels(value):
        # A string or any non-iterable is one label; other iterables are a
        # collection of labels. Checking labels one by one avoids calling
        # `list in Index`, which raises TypeError for unhashable lists.
        if isinstance(value, str) or not isinstance(value, Iterable):
            return [value]
        return list(value)

    using_index = left_index or right_index
    if not isinstance(right, DataFrame):
        raise TypeError('Argument `right` must be a DataFrame')
    if how not in ('left', 'right', 'outer', 'inner'):
        raise ValueError('Argument `how` must be either \'left\', \'right\', \'outer\', or \'inner\'')

    # Identity tests: `== None` on an array-like would compare element-wise.
    if on is not None:
        for label in as_labels(on):
            if label not in self._pd.columns or label not in right.columns:
                raise KeyError('Label \'{}\' not found in both DataFrames'.format(label))

    # Without an index join, `left_on` and `right_on` only make sense together.
    if not using_index and (left_on is None) != (right_on is None):
        raise KeyError('Both `left_on` and `right_on` must be column labels')

    if left_on is not None and right_on is not None:
        for label in as_labels(left_on):
            if label not in self._pd.columns:
                raise KeyError('Label \'{}\' not found in left DataFrame'.format(label))
        for label in as_labels(right_on):
            if label not in right.columns:
                raise KeyError('Label \'{}\' not found in right DataFrame'.format(label))

    f = _lift_to_pd(self._pd.merge)
    return f(
        right=right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index
    )
def append(self, other, ignore_index=False):
    '''
    Append rows of `other` to the end of caller, returning a new object.

    Columns in `other` that are not in the caller are added as new
    columns.

    Parameters
    ----------
    other : DataFrame
        The data to append.
    ignore_index : boolean, default False
        If True, do not use the index labels.

    Returns
    -------
    a_df : DataFrame
        DataFrame with appended rows.

    Raises
    ------
    TypeError
        If `other` is not a DataFrame or `ignore_index` is not a boolean.
    '''
    if not isinstance(other, DataFrame):
        raise TypeError('Argument `other` must be a DataFrame')
    if not isinstance(ignore_index, bool):
        raise TypeError('Argument `ignore_index` must be a boolean')
    # `pandas.DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # replacement and is available in every pandas version. The frames are
    # unwrapped by hand because they are nested inside a list.
    f = _lift_to_pd(pd.concat)
    return f([self._pd, other._pd], ignore_index=ignore_index)
# Plotting
def plot(self, *args, **kwargs):
    """
    DataFrame plotting method.

    All positional and keyword arguments are forwarded to the ``plot``
    accessor of the wrapped pandas DataFrame.

    Examples
    --------
    >>> df.plot(kind='line')
    >>> df.plot(kind='scatter', x='x', y='y')
    """
    plotter = _lift_to_pd(self._pd.plot)
    return plotter(*args, **kwargs)
# IO
def to_csv(self, path_or_buf=None, *, index=True):
    '''
    Write object to a comma-separated values (csv) file.

    Parameters
    ----------
    path_or_buf : str or file handle, default None
        File path or object, if None is provided the result is returned as
        a string.
    index : bool, default True, keyword only
        Write row names (index).

    Returns
    -------
    None or str
        If path_or_buf is None, returns the resulting csv format as a
        string. Otherwise returns None.

    Raises
    ------
    TypeError
        If `index` is not a boolean.
    '''
    if isinstance(index, bool):
        writer = _lift_to_pd(self._pd.to_csv)
        return writer(path_or_buf=path_or_buf, index=index)
    raise TypeError('Argument `index` must be a boolean')
def to_numpy(self):
    '''
    Convert the DataFrame to a NumPy array.

    The conversion is delegated to ``to_numpy`` of the wrapped pandas
    DataFrame, called without arguments.

    Parameters
    ----------
    None

    Returns
    -------
    df_arr : numpy.ndarray
        DataFrame as a NumPy array.
    '''
    return _lift_to_pd(self._pd.to_numpy)()
class SeriesStringMethods(object):
    '''
    String methods on Series objects. Will return bpd.Series
    '''

    def __init__(self, methods):
        # `methods` is the object whose attributes are exposed (the `.str`
        # accessor of a pandas Series in this module).
        self._methods = methods

    def __getattr__(self, name):
        # Look the attribute up on the wrapped accessor and lift its result.
        target = getattr(self._methods, name)
        return _lift_to_pd(target)

    def __dir__(self):
        # Advertise only the public names of the wrapped accessor.
        public = []
        for attr in dir(self._methods):
            if not attr.startswith('_'):
                public.append(attr)
        return public


class Series(object):
    '''
    Custom Series class; Pandas Series with methods removed.
    '''
def __init__(self, **kwargs):
    '''
    Create a Series.

    All keyword arguments are forwarded to ``pd.Series``; with no
    arguments the result is an empty Series.
    '''
    # The wrapped pandas Series; every method delegates to it.
    series = pd.Series(**kwargs)
    self._pd = series
    # Wrap the pandas indexers so their results come back as custom objects.
    self.loc = DataFrameIndexer(series.loc)
    self.iloc = DataFrameIndexer(series.iloc)
    # Snapshot a few attributes of the wrapped Series at construction time.
    self.shape = _lift_to_pd(series.shape)
    self.index = _lift_to_pd(series.index)
    self.values = _lift_to_pd(series.values)
@property
def str(self):
    '''
    String methods on Series.
    '''
    # Accessing `.str` on the wrapped pd.Series raises AttributeError when
    # the values are not strings. Evaluating it lazily in a property keeps
    # that behavior instead of failing at construction time.
    accessor = self._pd.str
    return SeriesStringMethods(accessor)


# Formatting
def __repr__(self):
    return self._pd.__repr__()


def __str__(self):
    return self._pd.__str__()


def __getitem__(self, key):
    '''
    Select elements with a Boolean indexer.

    Objects exposing ``to_ser`` are first converted with it. Any key that
    is not a Boolean indexer raises IndexError.
    '''
    if getattr(key, 'to_ser', None):
        # Convert to pd.Series
        key = key.to_ser()

    if com.is_bool_indexer(key):
        checked = indexing.check_bool_indexer(self.index, key)
        selector = _lift_to_pd(self._pd._get_with)
        return selector(checked)

    raise IndexError('BabyPandas only accepts Boolean objects '
                     'when indexing against the Series; please use '
                     '.loc or .iloc for more complex cases.')

# Selection
def take(self, indices):
    '''
    Return the elements in the given *positional* indices.

    This means that we are not indexing according to actual values in the
    index attribute of the object. We are indexing according to the actual
    position of the element in the object. Negative positions count from
    the end.

    Parameters
    ----------
    indices : array-like
        An array of ints indicating which positions to take.

    Returns
    -------
    taken : Series
        A Series containing the elements taken from the object.

    Raises
    ------
    TypeError
        If `indices` is not iterable.
    ValueError
        If `indices` contains anything other than integers.
    IndexError
        If any `indices` are out of bounds with respect to Series length.

    Examples
    --------
    >>> s = bpd.Series(data=[1, 2, 3], index=['A', 'B', 'C'])
    >>> s.take([0, 2])
    A    1
    C    3
    dtype: int64
    >>> s.take(np.arange(2))
    A    1
    B    2
    dtype: int64
    '''
    if not isinstance(indices, Iterable):
        raise TypeError('Argument `indices` must be a list-like object')
    # Materialize once: the checks below iterate the positions, which would
    # otherwise exhaust a one-shot iterable before it reaches pandas.
    indices = list(indices)
    if not all(isinstance(x, (int, np.integer)) for x in indices):
        raise ValueError('Argument `indices` must only contain integers')
    length = self._pd.shape[0]
    # Negative positions are valid down to -length; check both ends.
    if not all(-length <= x < length for x in indices):
        raise IndexError('Indices are out-of-bounds')
    f = _lift_to_pd(self._pd.take)
    return f(indices)
def sample(self, n=None, replace=False, random_state=None):
    '''
    Return a random sample of elements from a Series.

    You can use `random_state` for reproducibility.

    Parameters
    ----------
    n : None or int, optional
        Number of elements to return. None corresponds to 1.
    replace : {False, True}, optional
        Sample with or without replacement.
    random_state : int or numpy.random.RandomState, optional
        Seed for the random number generator (if int), or numpy RandomState
        object.

    Returns
    -------
    s_series : Series
        A new Series containing `n` items randomly sampled from the caller
        object.

    Raises
    ------
    TypeError
        If `n`, `replace` or `random_state` has an unsupported type.
    ValueError
        If a sample larger than the length of the Series is taken without
        replacement.

    Examples
    --------
    >>> s = bpd.Series(data=[1, 2, 3, 4, 5])
    >>> s.sample(3, random_state=0)
    2    3
    0    1
    1    2
    dtype: int64
    >>> s.sample(7, replace=True, random_state=10)
    1    2
    4    5
    0    1
    1    2
    3    4
    4    5
    1    2
    dtype: int64
    '''
    # numpy integers are accepted alongside Python ints.
    if n is not None and not isinstance(n, (int, np.integer)):
        raise TypeError('Argument `n` not an integer')
    if not isinstance(replace, bool):
        raise TypeError('Argument `replace` not a boolean')
    # The documented contract allows a RandomState instance as well as a seed.
    seed_types = (int, np.integer, np.random.RandomState)
    if random_state is not None and not isinstance(random_state, seed_types):
        raise TypeError('Argument `random_state` must be an integer or None')
    if n is not None and n > self._pd.shape[0] and not replace:
        # This is a Series method, so the message names the Series.
        raise ValueError('Cannot take a larger sample than length of Series when `replace=False`')
    f = _lift_to_pd(self._pd.sample)
    return f(n=n, replace=replace, random_state=random_state)
def get(self, key, default=None):
    """
    Get item from object for given key (ex: Series entry).

    Returns default value if not found.

    Parameters
    ----------
    key : object
    default : object, default None
        Forwarded to ``get`` of the wrapped pandas Series.

    Returns
    -------
    value : same type as items contained in object
    """
    lookup = _lift_to_pd(self._pd.get)
    return lookup(key, default=default)

# Transformation
def apply(self, func):
    '''
    Invoke function on values of Series.

    Can be ufunc (a NumPy function that applies to the entire Series) or a
    Python function that only works on single values.

    Parameters
    ----------
    func : function
        Python function or NumPy ufunc to apply.

    Returns
    -------
    a_obj : Series or DataFrame
        If func returns a Series object the result will be a DataFrame.

    Raises
    ------
    TypeError
        If `func` is not callable.

    Examples
    --------
    >>> def cut_off_5(val):
    ...     if val > 5:
    ...         return 5
    ...     else:
    ...         return val
    >>> s = bpd.Series(data=[1, 3, 5, 7, 9])
    >>> s.apply(cut_off_5)
    0    1
    1    3
    2    5
    3    5
    4    5
    dtype: int64
    '''
    if callable(func):
        applier = _lift_to_pd(self._pd.apply)
        return applier(func=func)
    raise TypeError('Argument `func` must be a function')
def sort_values(self, *, ascending=True):
    '''
    Sort a Series in ascending or descending order by its values.

    Parameters
    ----------
    ascending : bool, default True, keyword only
        If True, sort values in ascending order, otherwise descending.

    Returns
    -------
    s_series : Series
        Series ordered by values.

    Raises
    ------
    TypeError
        If `ascending` is not a boolean.

    Examples
    --------
    >>> s = bpd.Series(data=[6, 4, 3, 9, 5])
    >>> s.sort_values()
    2    3
    1    4
    4    5
    0    6
    3    9
    dtype: int64
    >>> s.sort_values(ascending=False)
    3    9
    0    6
    4    5
    1    4
    2    3
    dtype: int64
    '''
    if isinstance(ascending, bool):
        sorter = _lift_to_pd(self._pd.sort_values)
        return sorter(ascending=ascending)
    raise TypeError('Argument `ascending` must be a boolean')
def unique(self):
    '''
    Return unique values of Series object.

    Parameters
    ----------
    None

    Returns
    -------
    values : ndarray
        A NumPy array containing the unique values, in order of
        appearance.

    Examples
    --------
    >>> s = bpd.Series(data=[6, 7, 7, 5, 9, 5, 1])
    >>> s.unique()
    array([6, 7, 5, 9, 1])
    '''
    return _lift_to_pd(self._pd.unique)()
def describe(self):
    '''
    Generate descriptive statistics.

    Statistics summarize the central tendency, dispersion and shape of a
    Series' distribution, excluding ``NaN`` values. The computation is
    delegated to the wrapped pandas Series.

    Parameters
    ----------
    None

    Returns
    -------
    descr : Series
        Summary statistics of the Series provided.

    Examples
    --------
    >>> s = bpd.Series(data=[6, 7, 7, 5, 9, 5, 1])
    >>> s.describe()
    count    7.000000
    mean     5.714286
    std      2.497618
    min      1.000000
    25%      5.000000
    50%      6.000000
    75%      7.000000
    max      9.000000
    dtype: float64
    '''
    return _lift_to_pd(self._pd.describe)()
def reset_index(self, *, drop=False):
    '''
    Reset the index.

    This is useful when the index is meaningless and needs to be reset to
    the default before another operation.

    Parameters
    ----------
    drop : bool, default False, keyword only
        When True, do not try to insert index into dataframe columns. This
        resets the index to the default integer index. If False, then turn
        input Series into DataFrame, adding original index as column.

    Returns
    -------
    Series or DataFrame
        When `drop` is False (the default), a DataFrame is returned. The
        newly created columns will come first in the DataFrame, followed
        by the original Series values. When `drop` is True, a `Series` is
        returned.

    Raises
    ------
    TypeError
        If `drop` is not a boolean.

    Examples
    --------
    >>> s = bpd.Series(data=[6, 4, 3, 9, 5])
    >>> sorted = s.sort_values()
    >>> sorted.reset_index()
       index  0
    0      2  3
    1      1  4
    2      4  5
    3      0  6
    4      3  9
    >>> sorted.reset_index(drop=True)
    0    3
    1    4
    2    5
    3    6
    4    9
    dtype: int64
    '''
    if isinstance(drop, bool):
        resetter = _lift_to_pd(self._pd.reset_index)
        return resetter(drop=drop)
    raise TypeError('Argument `drop` must be a boolean')
def where(self, cond, other):
    '''
    Replace values where the condition is False.

    Parameters
    ----------
    cond : boolean Series, array-like, or callable
        Where cond is True, keep the original value. Where False, replace
        with corresponding value from other. If cond is callable, it is
        computed on the Series and should return boolean Series or array.
    other : scalar, Series/DataFrame, or callable
        Entries where cond is False are replaced with corresponding value
        from other. If other is callable, it is computed on the Series and
        should return scalar or Series.

    Returns
    -------
    s_series : Series
        A new Series with the values replaced when the condition is False.

    Notes
    -----
    The `where` method is an application of the if-then idiom. For each
    element in the calling Series, if ``cond`` is ``True`` the element is
    used; otherwise the corresponding element from ``other`` is used.
    Roughly ``ser1.where(m, ser2)`` is equivalent to
    ``np.where(m, ser1, ser2)``.

    Examples
    --------
    >>> s = bpd.Series(data=range(5))
    >>> s.where(s > 1, 10)
    0    10
    1    10
    2     2
    3     3
    4     4
    dtype: int64
    '''
    replacer = _lift_to_pd(self._pd.where)
    return replacer(cond, other)

# Plotting
def plot(self, *args, **kwargs):
    '''
    Series plotting method.

    All positional and keyword arguments are forwarded to the ``plot``
    accessor of the wrapped pandas Series.

    Examples
    --------
    >>> s.plot(kind='line')
    >>> s.plot(kind='bar')
    >>> s.plot(kind='hist')
    '''
    plotter = _lift_to_pd(self._pd.plot)
    return plotter(*args, **kwargs)
# IO
def to_csv(self, path_or_buf=None, index=True):
    '''
    Write object to a comma-separated values (csv) file.

    Parameters
    ----------
    path_or_buf : str or file handle, default None
        File path or object, if None is provided the result is returned as
        a string.
    index : bool, default True
        Write row names (index).

    Returns
    -------
    None or str
        If path_or_buf is None, returns the resulting csv format as a
        string. Otherwise returns None.

    Raises
    ------
    TypeError
        If `index` is not a boolean.
    '''
    if isinstance(index, bool):
        writer = _lift_to_pd(self._pd.to_csv)
        return writer(path_or_buf=path_or_buf, index=index)
    raise TypeError('Argument `index` must be a boolean')
def to_numpy(self):
    '''
    A NumPy ndarray representing the values in this Series.

    The conversion is delegated to ``to_numpy`` of the wrapped pandas
    Series, called without arguments.

    Parameters
    ----------
    None

    Returns
    -------
    arr : numpy.ndarray
    '''
    return _lift_to_pd(self._pd.to_numpy)()
# Calculations
def count(self):
    '''
    Return number of non-NA/null observations in the Series.
    '''
    return _lift_to_pd(self._pd.count)()
def mean(self):
    '''
    Return the mean of the values in the Series.
    '''
    return _lift_to_pd(self._pd.mean)()
def median(self):
    '''
    Return the median of the values in the Series.
    '''
    return _lift_to_pd(self._pd.median)()
def min(self):
    '''
    Return the minimum of the values in the Series.
    '''
    return _lift_to_pd(self._pd.min)()
def max(self):
    '''
    Return the maximum of the values in the Series.
    '''
    return _lift_to_pd(self._pd.max)()
def sum(self):
    '''
    Return the sum of the values in the Series.
    '''
    return _lift_to_pd(self._pd.sum)()
def abs(self):
    '''
    Return a Series with absolute numeric value of each element.
    '''
    return _lift_to_pd(self._pd.abs)()
# Arithmetic
# Each operator delegates to the same dunder on the wrapped pandas Series and
# lifts the result back to the custom classes.
def __add__(self, other):
    return _lift_to_pd(self._pd.__add__)(other)


def __radd__(self, other):
    return _lift_to_pd(self._pd.__radd__)(other)


def __mul__(self, other):
    return _lift_to_pd(self._pd.__mul__)(other)


def __rmul__(self, other):
    return _lift_to_pd(self._pd.__rmul__)(other)


def __pow__(self, other):
    return _lift_to_pd(self._pd.__pow__)(other)


def __rpow__(self, other):
    # Reflected form, so `2 ** series` works like `series ** 2` does.
    return _lift_to_pd(self._pd.__rpow__)(other)


def __sub__(self, other):
    return _lift_to_pd(self._pd.__sub__)(other)


def __rsub__(self, other):
    return _lift_to_pd(self._pd.__rsub__)(other)


def __neg__(self):
    return _lift_to_pd(self._pd.__neg__)()


def __truediv__(self, other):
    return _lift_to_pd(self._pd.__truediv__)(other)


def __rtruediv__(self, other):
    # Reflected form, so `1 / series` works like `series / 1` does.
    return _lift_to_pd(self._pd.__rtruediv__)(other)


def __mod__(self, other):
    return _lift_to_pd(self._pd.__mod__)(other)


def __rmod__(self, other):
    # Reflected form, so `7 % series` works like `series % 7` does.
    return _lift_to_pd(self._pd.__rmod__)(other)


# comparison
def __eq__(self, other):
    return _lift_to_pd(self._pd.__eq__)(other)


def __ne__(self, other):
    return _lift_to_pd(self._pd.__ne__)(other)


def __gt__(self, other):
    return _lift_to_pd(self._pd.__gt__)(other)


def __lt__(self, other):
    return _lift_to_pd(self._pd.__lt__)(other)


def __ge__(self, other):
    return _lift_to_pd(self._pd.__ge__)(other)


def __le__(self, other):
    return _lift_to_pd(self._pd.__le__)(other)


# bitwise operators
def __and__(self, other):
    return _lift_to_pd(self._pd.__and__)(other)


def __or__(self, other):
    return _lift_to_pd(self._pd.__or__)(other)


def __xor__(self, other):
    return _lift_to_pd(self._pd.__xor__)(other)


# other dunder methods
def __len__(self):
    return self._pd.__len__()


def __invert__(self):
    '''unary inversion, ~ operator'''
    return _lift_to_pd(self._pd.__invert__)()


# array interface (for applying numpy functions)
def __array__(self, *vargs, **kwargs):
    return self._pd.__array__(*vargs, **kwargs)


# return the underlying Series
def to_ser(self):
    '''Return the underlying Pandas series'''
    return self._pd


class DataFrameGroupBy(object):
    '''
    Wrapper around a pandas groupby object that exposes a small set of
    aggregations and lifts their results back to the custom classes.
    '''

    def __init__(self, groupby):
        # hidden pandas groupby object
        self._pd = groupby

    # return the underlying groupby object
    def to_gb(self):
        '''return the underlying pandas groupby object'''
        return self._pd

    def aggregate(self, func):
        '''
        Aggregate each group with `func`.

        Parameters
        ----------
        func : function
            Function applied to each group.

        Returns
        -------
        DataFrame or Series
            The aggregated result, lifted to the custom classes like the
            other aggregations on this class.

        Raises
        ------
        TypeError
            If `func` is not callable.
        '''
        if not callable(func):
            # TypeError is a subclass of Exception, so callers that caught
            # the previous bare Exception keep working.
            raise TypeError('Provide a function to aggregate')
        f = _lift_to_pd(self._pd.aggregate)
        return f(func)

    # Calculations
    def count(self):
        '''
        Compute count of group.
        '''
        f = _lift_to_pd(self._pd.count)
        return f()

    def mean(self):
        '''
        Compute mean of group.
        '''
        f = _lift_to_pd(self._pd.mean)
        return f()

    def median(self):
        '''
        Compute median of group.
        '''
        f = _lift_to_pd(self._pd.median)
        return f()

    def min(self):
        '''
        Compute min of group.
        '''
        f = _lift_to_pd(self._pd.min)
        return f()

    def max(self):
        '''
        Compute max of group.
        '''
        f = _lift_to_pd(self._pd.max)
        return f()

    def sum(self):
        '''
        Compute sum of group.
        '''
        f = _lift_to_pd(self._pd.sum)
        return f()

    def size(self):
        '''
        Compute group sizes.
        '''
        f = _lift_to_pd(self._pd.size)
        return f()
class DataFrameIndexer(object):
    '''
    Class lifts results of loc/iloc back to the
    custom DataFrame class.
    '''

    def __init__(self, indexer):
        self.idx = indexer

    @staticmethod
    def _to_pandas(item):
        # Custom objects keep their pandas counterpart on `_pd`; anything
        # else is passed through untouched.
        return getattr(item, '_pd', item)

    def __getitem__(self, item):
        # Convert custom objects to pandas before indexing. A tuple key
        # (e.g. ``df.loc[mask, 'col']``) is unwrapped element by element so
        # that a custom Boolean Series used as the row selector reaches
        # pandas as a real pandas Series.
        if isinstance(item, tuple):
            item = tuple(self._to_pandas(part) for part in item)
        else:
            item = self._to_pandas(item)

        # TODO: restrict what item can be? (e.g. boolean array)
        data = self.idx[item]

        if isinstance(data, pd.DataFrame):
            return DataFrame(data=data)
        elif isinstance(data, pd.Series):
            return Series(data=data)
        else:
            return data


def _lift_to_pd(func):
    '''Checks output-type of function and if output is a
    Pandas object, lifts the output to a babypandas class'''

    # Non-callables (plain attribute values) are returned unchanged.
    if not callable(func):
        return func

    wrapper_types = (DataFrame, DataFrameGroupBy, Series)

    def closure(*vargs, **kwargs):
        # Unwrap custom objects passed as top-level arguments ...
        vargs = [x._pd if isinstance(x, wrapper_types) else x for x in vargs]
        kwargs = {k: x._pd if isinstance(x, wrapper_types) else x
                  for (k, x) in kwargs.items()}

        a = func(*vargs, **kwargs)
        # ... and wrap pandas results back into the custom classes.
        if isinstance(a, pd.DataFrame):
            return DataFrame(data=a)
        elif isinstance(a, pd.Series):
            return Series(data=a)
        elif isinstance(a, pd.core.groupby.generic.DataFrameGroupBy):
            return DataFrameGroupBy(a)
        else:
            return a

    closure.__doc__ = func.__doc__

    return closure


def read_csv(filepath, **kwargs):
    '''read_csv'''
    df = pd.read_csv(filepath, **kwargs)
    return DataFrame(data=df)


# Reuse the pandas documentation for the wrapper.
read_csv.__doc__ = pd.read_csv.__doc__