from collections.abc import Iterable
import numpy as np
import pandas as pd
from pandas.core import common as com
from pandas.core import indexing
# Keep printed pandas output short: show at most 10 rows per frame/series.
pd.set_option("display.max_rows", 10)
class DataFrame(object):
    '''
    Custom DataFrame class; a pandas DataFrame with most methods removed.

    Wraps a hidden ``pd.DataFrame`` (``self._pd``) and re-exposes a small,
    curated subset of its API.  Lifted methods return babypandas objects
    rather than raw pandas ones.

    Examples
    --------
    >>> df = DataFrame.from_records([[1,2,3],[4,5,6]], columns=['a', 'b', 'c'])
    >>> df.shape
    (2, 3)
    >>> df.assign(d=[1,2]).shape
    (2, 4)
    >>> df.loc[1, 'b']
    5
    '''

    def __init__(self, **kwargs):
        '''
        Create an empty DataFrame.

        Keyword arguments are forwarded verbatim to ``pd.DataFrame``.
        '''
        # hidden pandas dataframe object
        self._pd = pd.DataFrame(**kwargs)
        # lift loc/iloc back to custom DataFrame objects
        self.loc = DataFrameIndexer(self._pd.loc)
        self.iloc = DataFrameIndexer(self._pd.iloc)

    # Internal helpers
    def _validate_column_labels(self, labels, arg_name):
        '''
        Validate that `labels` is a string label or an iterable of labels,
        and that every label names an existing column.

        Parameters
        ----------
        labels : str or iterable of str
            Candidate column label(s).
        arg_name : str
            Name of the caller's argument, used in the error message.

        Raises
        ------
        TypeError
            If `labels` is not a string or an iterable.
        KeyError
            If any label is not found in the columns.
        '''
        if not isinstance(labels, Iterable):
            raise TypeError('Argument `{}` must be a string label or '
                            'list of string labels'.format(arg_name))
        if isinstance(labels, str):
            mask = [labels not in self.columns]
        else:
            mask = [x not in self.columns for x in labels]
        if any(mask):
            as_list = [labels] if isinstance(labels, str) else labels
            raise KeyError('{} not found in columns'.format(np.array(as_list)[mask]))

    # Attributes
    @property
    def T(self):
        '''The transpose: rows become columns and vice versa.'''
        return self.__class__(data=self._pd.T)

    @property
    def index(self):
        '''The row labels of the DataFrame.'''
        return self._pd.index

    @property
    def columns(self):
        '''The column labels of the DataFrame.'''
        return self._pd.columns

    @property
    def values(self):
        '''A NumPy representation of the data.'''
        return self._pd.values

    @property
    def shape(self):
        '''Tuple of (number of rows, number of columns).'''
        return self._pd.shape

    # Formatting
    def __repr__(self):
        return self._pd.__repr__()

    def __str__(self):
        return self._pd.__str__()

    # return the underlying DataFrame
    def to_df(self):
        '''Return the full pandas DataFrame.'''
        return self._pd

    # Creation
    @classmethod
    def from_dict(cls, data):
        """
        Construct a DataFrame from a dict of array-likes or dicts.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.

        Returns
        -------
        DataFrame
        """
        return cls(data=data)

    @classmethod
    def from_records(cls, data, *, columns=None):
        """
        Convert structured or record ndarray to DataFrame.

        Parameters
        ----------
        data : ndarray (structured dtype), list of tuples, dict, or DataFrame
        columns : sequence, default None, keyword-only
            Column names to use.  If the passed data have no names, this
            argument provides them; otherwise it fixes the column order
            (names not found in the data become all-NA columns).

        Returns
        -------
        DataFrame
        """
        return cls(data=data, columns=columns)

    # Dunder Attributes
    def _repr_html_(self):
        f = _lift_to_pd(self._pd._repr_html_)
        return f()

    def __getitem__(self, key):
        # Only Boolean indexing is supported directly on the frame.
        if getattr(key, 'to_ser', None) is not None:  # convert bpd.Series -> pd.Series
            key = key.to_ser()
        if not com.is_bool_indexer(key):
            raise IndexError('BabyPandas only accepts Boolean objects '
                             'when indexing against the data frame; '
                             'please use .get to get columns, and '
                             '.loc or .iloc for more complex cases.')
        f = _lift_to_pd(self._pd._getitem_bool_array)
        return f(key)

    # Selection
    def take(self, indices):
        """
        Return the rows at the given *positional* indices.

        Positions refer to the physical order of the rows, not to the
        labels in the index.  Negative positions count from the end, as
        in pandas.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take.

        Returns
        -------
        taken : DataFrame
            A DataFrame containing the rows taken from the object.

        Raises
        ------
        TypeError
            If `indices` is not list-like.
        ValueError
            If `indices` contains non-integers.
        IndexError
            If any position is out of bounds for the DataFrame length.

        Examples
        --------
        >>> df = bpd.DataFrame().assign(name=['falcon', 'parrot', 'lion'],
        ...                             kind=['bird', 'bird', 'mammal'])
        >>> df.take([0, 2])
             name    kind
        0  falcon    bird
        2    lion  mammal
        """
        if not isinstance(indices, Iterable):
            raise TypeError('Argument `indices` must be a list-like object')
        if not all(isinstance(x, (int, np.integer)) for x in indices):
            raise ValueError('Argument `indices` must only contain integers')
        nrows = self._pd.shape[0]
        # bounds-check in both directions so a clear error is raised here
        # rather than deep inside pandas
        if not all(-nrows <= x < nrows for x in indices):
            raise IndexError('Indices are out-of-bounds')
        f = _lift_to_pd(self._pd.take)
        return f(indices=indices)

    def drop(self, *, columns=None):
        """
        Remove columns by specifying column names.

        Parameters
        ----------
        columns : single label or list-like, keyword-only
            Column names to drop.

        Returns
        -------
        df : DataFrame
            DataFrame with the dropped columns.

        Raises
        ------
        KeyError
            If any of the column labels is not found.

        Examples
        --------
        >>> df = bpd.DataFrame().assign(A=[0, 4], B=[1, 5], C=[2, 6])
        >>> df.drop(columns=['B', 'C'])
           A
        0  0
        1  4
        """
        self._validate_column_labels(columns, 'columns')
        f = _lift_to_pd(self._pd.drop)
        return f(columns=columns)

    def sample(self, n=None, *, replace=False, random_state=None):
        '''
        Return a random sample of rows from a data frame.

        Use `random_state` for reproducibility.

        Parameters
        ----------
        n : None or int, optional
            Number of rows to return. None corresponds to 1.
        replace : {False, True}, optional, keyword only
            Sample with or without replacement.
        random_state : int or numpy.random.RandomState, optional, keyword only
            Seed for the random number generator (if int), or numpy
            RandomState object.

        Returns
        -------
        s_df : DataFrame
            A new DataFrame containing `n` randomly sampled rows.

        Raises
        ------
        ValueError
            If a sample larger than the length of the DataFrame is taken
            without replacement.

        Examples
        --------
        >>> df = bpd.DataFrame().assign(letter=['a', 'b', 'c'])
        >>> df.sample(1, random_state=0)
          letter
        2      c
        '''
        if n is not None and not isinstance(n, (int, np.integer)):
            raise TypeError('Argument `n` not an integer')
        if not isinstance(replace, bool):
            raise TypeError('Argument `replace` not a boolean')
        # RandomState objects are allowed, as documented above.
        if (random_state is not None and
                not isinstance(random_state, (int, np.integer, np.random.RandomState))):
            raise TypeError('Argument `random_state` must be an integer, '
                            'numpy RandomState, or None')
        if n is not None and n > self._pd.shape[0] and not replace:
            raise ValueError('Cannot take a larger sample than length of DataFrame when `replace=False`')
        f = _lift_to_pd(self._pd.sample)
        return f(n=n, replace=replace, random_state=random_state)

    def get(self, key):
        '''
        Return column or columns from the data frame.

        Parameters
        ----------
        key : str or iterable of strings
            Column label or iterable of column labels.

        Returns
        -------
        series_or_df : Series or DataFrame
            Series for a single label, DataFrame for a list of labels.

        Raises
        ------
        KeyError
            If any column named in `key` is not found in the columns.

        Examples
        --------
        >>> df = bpd.DataFrame().assign(letter=['a', 'b'], count=[9, 3])
        >>> df.get('letter')
        0    a
        1    b
        Name: letter, dtype: object
        '''
        self._validate_column_labels(key, 'key')
        f = _lift_to_pd(self._pd.get)
        return f(key=key)

    # Creation
    def assign(self, **kwargs):
        '''
        Assign new columns to a DataFrame.

        Returns a new object with all original columns plus the new ones;
        re-assigned existing columns are overwritten.

        Parameters
        ----------
        **kwargs : dict of {str: callable, Series, scalar, or array}
            The column names are keywords.  Callables are computed on the
            DataFrame; other values are assigned directly.

        Returns
        -------
        df_with_cols : DataFrame
            A new DataFrame with the new columns added.

        Raises
        ------
        ValueError
            If the sized new columns have different lengths, or differ in
            length from a non-empty existing DataFrame.

        Examples
        --------
        >>> df = bpd.DataFrame().assign(flower=['sunflower', 'rose'])
        >>> df.assign(color=['yellow', 'red'])
              flower   color
        0  sunflower  yellow
        1       rose     red
        '''
        # Only values with a length can be compared for consistency;
        # scalars and callables (both allowed) are left to pandas.
        sized = [v for v in kwargs.values()
                 if not callable(v) and hasattr(v, '__len__')]
        if len(set(map(len, sized))) > 1:
            raise ValueError('Not all columns have the same length')
        if sized and self._pd.shape[1] != 0 and len(sized[0]) != self._pd.shape[0]:
            raise ValueError('New column does not have the same length as existing DataFrame')
        f = _lift_to_pd(self._pd.assign)
        return f(**kwargs)

    # Transformation
    def apply(self, func, axis=0):
        """
        Apply a function along an axis of the DataFrame.

        Objects passed to the function are Series whose index is either the
        DataFrame's index (``axis=0``) or its columns (``axis=1``).  The
        return type is inferred from the return type of `func`.

        Parameters
        ----------
        func : function
            Function to apply to each column or row.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            0/'index': apply to each column; 1/'columns': apply to each row.

        Returns
        -------
        applied : Series or DataFrame
            Result of applying ``func`` along the given axis.

        Raises
        ------
        TypeError
            If `func` is not callable.
        ValueError
            If `axis` is not one of the accepted values.
        """
        if not callable(func):
            raise TypeError('Argument `func` must be a function')
        if axis not in [0, 1, 'index', 'columns']:
            raise ValueError('Argument `axis` must be one of 0, 1 '
                             '"index" or "columns"')
        f = _lift_to_pd(self._pd.apply)
        return f(func=func, axis=axis)

    def sort_values(self, by, *, ascending=True):
        '''
        Sort by the values in the column(s) named in `by`.

        Parameters
        ----------
        by : str or list of str
            Name or list of column names to sort by.
        ascending : bool, default True, keyword only
            Sort ascending vs. descending.

        Returns
        -------
        sorted_obj : DataFrame

        Raises
        ------
        KeyError
            If `by` is not found in the columns.

        Examples
        --------
        >>> df = bpd.DataFrame().assign(name=['Sally', 'Bill'], age=[21, 18])
        >>> df.sort_values(by='age')
            name  age
        1   Bill   18
        0  Sally   21
        '''
        self._validate_column_labels(by, 'by')
        if not isinstance(ascending, bool):
            raise TypeError('Argument `ascending` must be a boolean')
        f = _lift_to_pd(self._pd.sort_values)
        return f(by=by, ascending=ascending)

    def describe(self):
        '''
        Generate descriptive statistics.

        Summarizes central tendency, dispersion and shape of the
        distribution of each column, excluding ``NaN`` values.  Handles
        both numeric and object columns.

        Returns
        -------
        descr : DataFrame
            Summary statistics (count, mean, std, min, quartiles, max)
            of the DataFrame provided.
        '''
        f = _lift_to_pd(self._pd.describe)
        return f()

    def groupby(self, by):
        '''
        Group the DataFrame by values in the columns specified in `by`.

        A groupby operation involves splitting the object, applying a
        function, and combining the results.

        Parameters
        ----------
        by : label, or list of labels
            Column label(s) to group by.  Note that a tuple is
            interpreted as a (single) key.

        Returns
        -------
        df_gb : DataFrameGroupBy
            Groupby object containing information about the groups.

        Raises
        ------
        KeyError
            If `by` is not found in the columns.

        Examples
        --------
        >>> df.groupby('animal').mean()
        '''
        self._validate_column_labels(by, 'by')
        f = _lift_to_pd(self._pd.groupby)
        return f(by=by)

    def reset_index(self, *, drop=False):
        '''
        Reset the index of the DataFrame to the default integer index.

        Parameters
        ----------
        drop : bool, default False, keyword only
            If True, do not insert the old index into the columns.

        Returns
        -------
        DataFrame
            DataFrame with the new index.

        Examples
        --------
        >>> df.sort_values(by='age').reset_index(drop=True)
        '''
        if not isinstance(drop, bool):
            raise TypeError('Argument `drop` must be a boolean')
        f = _lift_to_pd(self._pd.reset_index)
        return f(drop=drop)

    def set_index(self, keys, drop=True):
        '''
        Set the DataFrame index (row labels) using existing columns.

        The new index replaces the existing one.

        Parameters
        ----------
        keys : label or list of labels
            Column key(s) to use as the new index.
        drop : bool, default True
            Delete the columns used as the new index.

        Returns
        -------
        DataFrame
            Data frame with changed row labels.

        Raises
        ------
        KeyError
            If `keys` is not found in the columns.

        Examples
        --------
        >>> df.set_index('name')
        '''
        self._validate_column_labels(keys, 'keys')
        if not isinstance(drop, bool):
            raise TypeError('Argument `drop` must be a boolean')
        f = _lift_to_pd(self._pd.set_index)
        return f(keys=keys, drop=drop)

    # Combining
    def merge(
        self, right, how='inner', on=None, left_on=None, right_on=None,
        left_index=False, right_index=False
    ):
        '''
        Merge DataFrame or named Series objects with a database-style join.

        The join is done on columns or indexes.  If joining columns on
        columns, the DataFrame indexes *will be ignored*; otherwise the
        index is passed on.

        Parameters
        ----------
        right : DataFrame or named Series
            Object to merge with.
        how : {'left', 'right', 'outer', 'inner'}, default 'inner'
            Type of merge:

            * left: use only keys from left frame (SQL left outer join).
            * right: use only keys from right frame (SQL right outer join).
            * outer: use union of keys from both frames (SQL full outer
              join); sort keys lexicographically.
            * inner: use intersection of keys from both frames (SQL inner
              join); preserve the order of the left keys.
        on : label or list
            Column or index level names to join on; must be found in both
            DataFrames.  Defaults to the intersection of the columns.
        left_on : label or list, or array-like
            Column(s) to join on in the left DataFrame.
        right_on : label or list, or array-like
            Column(s) to join on in the right DataFrame.
        left_index : boolean, default False
            Use the index from the left DataFrame as the join key(s).
        right_index : boolean, default False
            Use the index from the right DataFrame as the join key(s).

        Returns
        -------
        DataFrame
            A DataFrame of the two merged objects.

        Raises
        ------
        KeyError
            If any input labels are not found in the corresponding
            DataFrame's columns.

        Examples
        --------
        >>> df1.merge(df2, on='kind')
        '''
        using_index = left_index or right_index
        if not isinstance(right, DataFrame):
            raise TypeError('Argument `right` must be a DataFrame')
        if how not in ['left', 'right', 'outer', 'inner']:
            raise ValueError("Argument `how` must be either 'left', "
                             "'right', 'outer', or 'inner'")
        if on is not None and (on not in self._pd.columns or on not in right.columns):
            raise KeyError("Label '{}' not found in both DataFrames".format(on))
        # left_on and right_on must be given together (unless joining on an index)
        if not using_index and ((left_on is None) != (right_on is None)):
            raise KeyError('Both `left_on` and `right_on` must be column labels')
        if left_on is not None and right_on is not None:
            if left_on not in self._pd.columns:
                raise KeyError("Label '{}' not found in left DataFrame".format(left_on))
            if right_on not in right.columns:
                raise KeyError("Label '{}' not found in right DataFrame".format(right_on))
        f = _lift_to_pd(self._pd.merge)
        return f(
            right=right, how=how, on=on, left_on=left_on, right_on=right_on,
            left_index=left_index, right_index=right_index
        )

    def append(self, other, ignore_index=False):
        '''
        Append the rows of `other` to the end of the caller, returning a
        new object.  Columns of `other` not in the caller are added as new
        columns.

        NOTE(review): pd.DataFrame.append was removed in pandas 2.0;
        this wrapper assumes a pandas version that still provides it.

        Parameters
        ----------
        other : DataFrame
            The data to append.
        ignore_index : boolean, default False
            If True, do not use the index labels.

        Returns
        -------
        a_df : DataFrame
            DataFrame with appended rows.
        '''
        if not isinstance(other, DataFrame):
            raise TypeError('Argument `other` must be a DataFrame')
        if not isinstance(ignore_index, bool):
            raise TypeError('Argument `ignore_index` must be a boolean')
        f = _lift_to_pd(self._pd.append)
        return f(other=other, ignore_index=ignore_index)

    # Plotting
    def plot(self, *args, **kwargs):
        """
        DataFrame plotting accessor and method.

        Examples
        --------
        >>> df.plot.line()
        >>> df.plot.scatter('x', 'y')
        >>> df.plot.hexbin()
        """
        f = _lift_to_pd(self._pd.plot)
        return f(*args, **kwargs)

    # IO
    def to_csv(self, path_or_buf=None, *, index=True):
        '''
        Write the object to a comma-separated values (csv) file.

        Parameters
        ----------
        path_or_buf : str or file handle, default None
            File path or object; if None the result is returned as a string.
        index : bool, default True, keyword only
            Write row names (index).

        Returns
        -------
        None or str
            The csv as a string if `path_or_buf` is None, else None.
        '''
        if not isinstance(index, bool):
            raise TypeError('Argument `index` must be a boolean')
        f = _lift_to_pd(self._pd.to_csv)
        return f(path_or_buf=path_or_buf, index=index)

    def to_numpy(self):
        '''
        Convert the DataFrame to a NumPy array.

        The dtype of the returned array is the common NumPy dtype of all
        columns, which may require copying and coercing values.

        Returns
        -------
        df_arr : numpy.ndarray
            DataFrame as a NumPy array.
        '''
        f = _lift_to_pd(self._pd.to_numpy)
        return f()
class SeriesStringMethods(object):
    '''
    Vectorized string methods for Series objects; results are lifted
    back to bpd.Series.
    '''

    def __init__(self, methods):
        # the wrapped pandas ``.str`` accessor
        self._methods = methods

    def __getattr__(self, name):
        # Delegate attribute lookup to the pandas accessor, lifting the
        # result so it returns babypandas objects.
        return _lift_to_pd(getattr(self._methods, name))

    def __dir__(self):
        # Expose only the accessor's public string methods.
        return [attr for attr in dir(self._methods)
                if not attr.startswith('_')]
class Series(object):
'''
Custom Series class; Pandas Series with methods removed.
'''
[docs] def __init__(self, **kwargs):
'''
Create an empty Series.
'''
# hidden pandas dataeriesframe object
self._pd = pd.Series(**kwargs)
# lift loc/iloc back to custom Series objects
self.loc = DataFrameIndexer(self._pd.loc)
self.iloc = DataFrameIndexer(self._pd.iloc)
self.shape = _lift_to_pd(self._pd.shape)
self.index = _lift_to_pd(self._pd.index)
self.values = _lift_to_pd(self._pd.values)
@property
def str(self):
'''
String methods on Series.
'''
# accessing the `.str` attribute of a pd.Series will raise an
# AttributeError if the series does not consist of string values. We
# use a property here to replicate this behavior.
return SeriesStringMethods(self._pd.str)
# Formatting
def __repr__(self):
return self._pd.__repr__()
def __str__(self):
return self._pd.__str__()
def __getitem__(self, key):
if getattr(key, 'to_ser', None): # Convert to pd.Series
key = key.to_ser()
if not com.is_bool_indexer(key):
raise IndexError('BabyPandas only accepts Boolean objects '
'when indexing against the Series; please use '
'.loc or .iloc for more complex cases.')
key = indexing.check_bool_indexer(self.index, key)
f = _lift_to_pd(self._pd._get_with)
return f(key)
# Selection
[docs] def take(self, indices):
'''
Return the elements in the given *positional* indices.
This means that we are not indexing according to actual values in the
index attribute of the object. We are indexing according to the actual
position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
Returns
-------
taken : Series
A Series containing the elements taken from the object.
Raises
------
IndexError
If any `indices` are out of bounds with respect to Series
length.
Examples
--------
>>> s = bpd.Series(data=[1, 2, 3], index=['A', 'B', 'C'])
>>> s.take([0, 3])
A 1
C 3
dtype: int64
>>> s.take(np.arange(2))
A 1
B 2
dtype: int64
'''
if not isinstance(indices, Iterable):
raise TypeError('Argument `indices` must be a list-like object')
if not all(isinstance(x, (int, np.integer)) for x in indices):
raise ValueError('Argument `indices` must only contain integers')
if not all(x < self._pd.shape[0] for x in indices):
raise IndexError('Indices are out-of-bounds')
f = _lift_to_pd(self._pd.take)
return f(indices)
[docs] def sample(self, n=None, replace=False, random_state=None):
'''
Return a random sample of elements from a Series.
You can use `random_state` for reproducibility.
Parameters
----------
n : None or int, optional
Number of elements to return. None corresponds to 1.
replace : {False, True}, optional, keyword only.
Sample with or without replacement.
random_state : int or numpy.random.RandomState, optional, keyword only
Seed for the random number generator (if int), or numpy RandomState
object.
Returns
-------
s_series : Series
A new Series containing `n` items randomly sampled from the caller
object.
Raises
------
ValueError
If a sample larger than the length of the Series is taken
without replacement.
Examples
--------
>>> s = bpd.Series(data=[1, 2, 3, 4, 5])
>>> s.sample(3, random_state=0)
2 3
0 1
1 2
dtype: int64
>>> s.sample(7, replace=True, random_state=10)
1 2
4 5
0 1
1 2
3 4
4 5
1 2
dtype: int64
'''
if not isinstance(n, int) and n != None:
raise TypeError('Argument `n` not an integer')
if not isinstance(replace, bool):
raise TypeError('Argument `replace` not a boolean')
if not isinstance(random_state, int) and random_state != None:
raise TypeError('Argument `random_state` must be an integer or None')
if n != None and n > self._pd.shape[0] and replace == False:
raise ValueError('Cannot take a larger sample than length of DataFrame when `replace=False`')
f = _lift_to_pd(self._pd.sample)
return f(n=n, replace=replace, random_state=random_state)
def get(self, key, default=None):
"""
Get item from object for given key (ex: Series entry).
Returns default value if not found.
Parameters
----------
key : object
Returns
-------
value : same type as items contained in object
"""
f = _lift_to_pd(self._pd.get)
return f(key, default=default)
# Transformation
[docs] def apply(self, func):
'''
Invoke function on values of Series.
Can be ufunc (a NumPy function that applies to the entire Series)
or a Python function that only works on single values.
Parameters
----------
func : function
Python function or NumPy ufunc to apply.
Returns
-------
a_obj : Series or DataFrame
If func returns a Series object the result will be a DataFrame.
Examples
--------
>>> def cut_off_5(val):
... if val > 5:
... return 5
... else:
... return val
>>> s = bpd.Series(data=[1, 3, 5, 7, 9]
>>> s.apply(cut_off_5)
0 1
1 3
2 5
3 5
4 5
dtype: int64
'''
if not callable(func):
raise TypeError('Argument `func` must be a function')
f = _lift_to_pd(self._pd.apply)
return f(func=func)
[docs] def sort_values(self, *, ascending=True):
'''
Sort by the values.
Sort a Series in ascending or descending order.
Parameters
----------
ascending : bool, default True, keyword only
If True, sort values in ascending order, otherwise descending.
Returns
-------
s_series : Series
Series ordered by values.
Example
-------
>>> s = bpd.Series(data=[6, 4, 3, 9, 5])
>>> s.sort_values()
2 3
1 4
4 5
0 6
3 9
dtype: int64
>>> s.sort_values(ascending=False)
3 9
0 6
4 5
1 4
2 3
dtype: int64
'''
if not isinstance(ascending, bool):
raise TypeError('Argument `ascending` must be a boolean')
f = _lift_to_pd(self._pd.sort_values)
return f(ascending=ascending)
def unique(self):
'''
Return unique values of Series object.
Parameters
----------
None
Returns
-------
values : ndarray
A NumPy array containing the unique values, in order of appearance.
Examples
--------
>>> s = bpd.Series(data=[6, 7, 7, 5, 9, 5, 1])
>>> s.unique()
array([6, 7, 5, 9, 1])
'''
f = _lift_to_pd(self._pd.unique)
return f()
[docs] def describe(self):
'''
Generate descriptive statistics.
Statistics summarize the central tendency, dispersion and shape of a
Series' distribution, excluding ``NaN`` values.
Analyzes both numeric and object series.
Parameters
----------
None
Returns
-------
descr : Series
Summary statistics of the Series provided.
Examples
--------
>>> s = bpd.Series(data=[6, 7, 7, 5, 9, 5, 1])
>>> s.describe()
count 7.000000
mean 5.714286
std 2.497618
min 1.000000
25% 5.000000
50% 6.000000
75% 7.000000
max 9.000000
dtype: float64
'''
f = _lift_to_pd(self._pd.describe)
return f()
[docs] def reset_index(self, *, drop=False):
'''
Reset the index.
This is useful when the index is meaningless and needs to be reset to
the default before another operation.
Parameters
----------
drop : bool, default False, keyword only
When True, do not try to insert index into dataframe columns. This
resets the index to the default integer index. If False, then turn
input Series into DataFrame, adding original index as column.
Returns
-------
Series or DataFrame
When `drop` is False (the default), a DataFrame is returned.
The newly created columns will come first in the DataFrame,
followed by the original Series values.
When `drop` is True, a `Series` is returned.
Examples
--------
>>> s = bpd.Series([6, 4, 3, 9, 5])
>>> sorted = s.sort_values()
>>> sorted.reset_index()
index 0
0 2 3
1 1 4
2 4 5
3 0 6
4 3 9
>>> sorted.reset_index(drop=True)
0 3
1 4
2 5
3 6
4 9
dtype: int64
'''
if not isinstance(drop, bool):
raise TypeError('Argument `drop` must be a boolean')
f = _lift_to_pd(self._pd.reset_index)
return f(drop=drop)
def where(self, cond, other):
'''
Replace values where the condition is False.
Parameters
----------
cond : boolean Series, array-like, or callable
Where cond is True, keep the original value. Where False, replace
with corresponding value from other. If cond is callable, it is
computed on the Series and should return boolean Series or array.
other : scalar, Series/DataFrame, or callable
Entries where cond is False are replaced with corresponding value
from other. If other is callable, it is computed on the Series
and should return scalar or Series.
Returns
-------
s_series : Series
A new Series with the values replaced when the condition is False.
Notes
-----
The `where` method is an application of the if-then idiom. For each
element in the calling Series, if ``cond`` is ``True`` the
element is used; otherwise the corresponding element from the Series
``other`` is used.
The signature for :func:`Series.where` differs from
:func:`numpy.where`. Roughly ``ser1.where(m, ser2)`` is equivalent to
``np.where(m, ser1, ser2)``.
Examples
--------
>>> s = pd.Series(range(5))
>>> s.where(s > 1, 10)
0 10
1 10
2 2
3 3
4 4
dtype: int64
'''
f = _lift_to_pd(self._pd.where)
return f(cond, other)
# Plotting
[docs] def plot(self, *args, **kwargs):
'''
Series plotting accessor and method.
Examples
--------
>>> s.plot.line()
>>> s.plot.bar()
>>> s.plot.hist()
'''
f = _lift_to_pd(self._pd.plot)
return f(*args, **kwargs)
# IO
[docs] def to_csv(self, path_or_buf=None, index=True):
'''
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str or file handle, default None
File path or object, if None is provided the result is returned as
a string.
index : bool, default True
Write row names (index).
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
'''
if not isinstance(index, bool):
raise TypeError('Argument `index` must be a boolean')
f = _lift_to_pd(self._pd.to_csv)
return f(path_or_buf=path_or_buf, index=index)
[docs] def to_numpy(self):
'''
A NumPy ndarray representing the values in this Series or Index.
Parameters
----------
None
Returns
-------
arr : numpy.ndarray
'''
f = _lift_to_pd(self._pd.to_numpy)
return f()
# Calculations
[docs] def count(self):
'''
Return number of non-NA/null observations in the Series.
'''
f = _lift_to_pd(self._pd.count)
return f()
[docs] def mean(self):
'''
Return the mean of the values for the requested axis.
'''
f = _lift_to_pd(self._pd.mean)
return f()
[docs] def min(self):
'''
Return the minimum of the values in the Series.
'''
f = _lift_to_pd(self._pd.min)
return f()
[docs] def max(self):
'''
Return the maximum of the values in the Series.
'''
f = _lift_to_pd(self._pd.max)
return f()
[docs] def sum(self):
'''
Return the sum of the values in the Series.
'''
f = _lift_to_pd(self._pd.sum)
return f()
[docs] def abs(self):
'''
Return a Series with absolute numeric value of each element.
'''
f = _lift_to_pd(self._pd.abs)
return f()
# Arithmetic operators: each one delegates to the matching pandas
# dunder and lifts any pandas result back to a babypandas object.
def __add__(self, other):
    return _lift_to_pd(self._pd.__add__)(other)

def __radd__(self, other):
    return _lift_to_pd(self._pd.__radd__)(other)

def __mul__(self, other):
    return _lift_to_pd(self._pd.__mul__)(other)

def __rmul__(self, other):
    return _lift_to_pd(self._pd.__rmul__)(other)

def __pow__(self, other):
    return _lift_to_pd(self._pd.__pow__)(other)

def __sub__(self, other):
    return _lift_to_pd(self._pd.__sub__)(other)

def __rsub__(self, other):
    return _lift_to_pd(self._pd.__rsub__)(other)

def __neg__(self):
    return _lift_to_pd(self._pd.__neg__)()

def __truediv__(self, other):
    return _lift_to_pd(self._pd.__truediv__)(other)

def __mod__(self, other):
    return _lift_to_pd(self._pd.__mod__)(other)
# Comparison operators: delegate to pandas (which produces boolean
# Series) and lift the result back to babypandas.
def __eq__(self, other):
    return _lift_to_pd(self._pd.__eq__)(other)

def __ne__(self, other):
    return _lift_to_pd(self._pd.__ne__)(other)

def __gt__(self, other):
    return _lift_to_pd(self._pd.__gt__)(other)

def __lt__(self, other):
    return _lift_to_pd(self._pd.__lt__)(other)

def __ge__(self, other):
    return _lift_to_pd(self._pd.__ge__)(other)

def __le__(self, other):
    return _lift_to_pd(self._pd.__le__)(other)
# Bitwise operators (elementwise &, |, ^), delegated to pandas.
def __and__(self, other):
    return _lift_to_pd(self._pd.__and__)(other)

def __or__(self, other):
    return _lift_to_pd(self._pd.__or__)(other)

def __xor__(self, other):
    return _lift_to_pd(self._pd.__xor__)(other)
# Other dunder methods
def __len__(self):
    '''Number of elements in the underlying pandas object.'''
    return len(self._pd)

def __invert__(self):
    '''Unary inversion (the ~ operator), lifted back to babypandas.'''
    return _lift_to_pd(self._pd.__invert__)()
# Array interface, so numpy functions can be applied directly.
def __array__(self, *args, **kwargs):
    '''Expose the underlying values via the numpy array protocol.'''
    return self._pd.__array__(*args, **kwargs)
# Access to the wrapped pandas object
def to_ser(self):
    '''Return the underlying pandas Series.'''
    return self._pd
class DataFrameGroupBy(object):
    '''
    Wrapper around a pandas ``DataFrameGroupBy`` object whose
    aggregation methods lift their results back to babypandas objects.
    '''

    def __init__(self, groupby):
        # hidden pandas groupby object
        self._pd = groupby

    def to_gb(self):
        '''Return the underlying pandas groupby object.'''
        return self._pd

    def aggregate(self, func):
        '''
        Aggregate each group using ``func``.

        Parameters
        ----------
        func : callable
            Function applied to each group.

        Raises
        ------
        TypeError
            If ``func`` is not callable.
        '''
        if not callable(func):
            # TypeError (a subclass of Exception) is the idiomatic
            # error for a bad argument type; callers already catching
            # Exception keep working.
            raise TypeError('Provide a function to aggregate')
        # NOTE(review): unlike the methods below, the result here is
        # the raw pandas object (not lifted via _lift_to_pd) -- kept
        # as-is for backward compatibility with existing callers.
        return self._pd.aggregate(func)

    # Calculations
    def count(self):
        '''Compute count of group.'''
        return _lift_to_pd(self._pd.count)()

    def mean(self):
        '''Compute mean of group.'''
        return _lift_to_pd(self._pd.mean)()

    def min(self):
        '''Compute min of group.'''
        return _lift_to_pd(self._pd.min)()

    def max(self):
        '''Compute max of group.'''
        return _lift_to_pd(self._pd.max)()

    def sum(self):
        '''Compute sum of group.'''
        return _lift_to_pd(self._pd.sum)()

    def size(self):
        '''Compute group sizes.'''
        return _lift_to_pd(self._pd.size)()
class DataFrameIndexer(object):
    '''
    Wraps a pandas loc/iloc indexer and lifts the results of indexing
    back to the custom DataFrame / Series classes.
    '''

    def __init__(self, indexer):
        # the wrapped pandas .loc / .iloc indexer
        self.idx = indexer

    def __getitem__(self, item):
        # Unwrap a babypandas object (e.g. a boolean Series used as a
        # mask) to its hidden pandas equivalent before indexing.
        item = getattr(item, '_pd', item)
        # TODO: restrict what item can be? (e.g. boolean array)
        result = self.idx[item]
        if isinstance(result, pd.DataFrame):
            return DataFrame(data=result)
        if isinstance(result, pd.Series):
            return Series(data=result)
        # scalars and anything else pass through unchanged
        return result
def _lift_to_pd(func):
'''Checks output-type of function and if output is a
Pandas object, lifts the output to a babypandas class'''
if not callable(func):
return func
types = (DataFrame, DataFrameGroupBy, Series)
def closure(*vargs, **kwargs):
vargs = [x._pd if isinstance(x, types) else x for x in vargs]
kwargs = {k: x._pd if isinstance(x, types) else x
for (k, x) in kwargs.items()}
a = func(*vargs, **kwargs)
if isinstance(a, pd.DataFrame):
return DataFrame(data=a)
elif isinstance(a, pd.Series):
return Series(data=a)
elif isinstance(a, pd.core.groupby.generic.DataFrameGroupBy):
return DataFrameGroupBy(a)
else:
return a
closure.__doc__ = func.__doc__
return closure
def read_csv(filepath, **kwargs):
    '''read_csv'''
    frame = pd.read_csv(filepath, **kwargs)
    return DataFrame(data=frame)

# Expose the full pandas read_csv documentation on the wrapper
# (replaces the placeholder docstring above).
read_csv.__doc__ = pd.read_csv.__doc__