import pandas as pd
import numpy as np #NumPy library required by Pandas library
df = pd.read_csv(file_path, index_col=None)
df.columns
Series.name
df.shape
Series.shape
df.dtypes == 'object'
df.describe()
for col in df.columns: # for loop to get column names
df[col].unique() # returns unique values in column 'col'
df[col].nunique() # returns the NUMBER of unique values in column, excluding NAs by default (dropna=True)
df.filter(items=None, like=None, regex=None, axis=None) # subset rows/columns by label (exact list, substring, or regex)
df.iloc[row_position, col_position] # general format
Note: A COPY of the dataframe slice is returned. This CANNOT be used for assignment. Also, note that only integers, their derivations, and boolean arrays can be used to index.
index = (df.Color=="Red") & (df.Item=="Shirt") # produces a boolean array
df.loc[index] # returns original dataframe sliced
df.loc[row_indexer, col_indexer] #general format
Note: colon ":" is not needed by col_indexer like in *df.loc[index, :]* to choose all columns, and slice of ORIGINAL dataframe is returned. This can be used for assignment.
df.query('expression') # expression must be conditional using column variable/s
# prefix "@" before variable name if outside of dataframe env.
# wrap variable names containing spaces in backticks, e.g. `my col`
Note: Can be used to filter rows.
df.apply(FUN, axis=0, *args) # FUN: any (valid) function to apply
# axis: axis to assess
# *args: additional positional arguments passed to FUN (use **kwargs for keyword arguments)
df.assign(col_4 = df.col_1*df.col_2/df.col_3) # or col_4 = lambda d: d.col_1*d.col_2/d.col_3
Note: Used to create new columns. Similar to "mutate" in R.
pd.get_dummies(df, dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=np.uint8) # top-level pandas function, NOT a DataFrame method
# dummy_na: if True, adds dummy column for NA
# columns: if not None, only the specified columns are one-hot encoded
# sparse: whether resulting columns will be SparseArray objects (True) or NumPy arrays(False)
# drop_first: removes first unique dummy variable from unique objects in column
# dtype: data type for new columns (e.g. float)
pd.get_dummies(series, dummy_na=False, sparse=False, drop_first=False, dtype=np.uint8) # also accepts a Series; Series.str.get_dummies() exists separately for delimiter-separated strings
df.groupby(by=category).transform(FUN)
Note: Grouping a dataframe then transforming it is a very common operation. A dataframe is converted into a "GroupBy" object depending on *category*, which may be a string or list of strings representing column names. Then the resulting object's values are passed into *FUN* by the transform function and a column of the same size is returned.
# Finding NA and inverse
df.isna()
Series.isna()
df.notna()
Series.notna()
# Dropping NA
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
# axis: axis to assess
# how: when to drop axis (e.g. how='all' drops IFF all values are NA)
# thresh: minimum number of NON-NA values required to KEEP a row/column (fewer -> dropped)
# subset: labels on axis to be considered
# inplace: by default, returns new object. If True, modifies existing object
Series.dropna(axis=0, how=None,inplace=False)
# Filling NA
df.fillna(value=None, method=None, axis=None, inplace=False, limit=None)
# value: value to fill holes
# method: method for filling holes ('ffill' propagates the last valid value forward; 'bfill' uses the next valid value)
# inplace: by default, returns new object. If True, modifies existing object
# limit: max number of consecutive NAs to fill. Will be left as NA if value is exceeded
Series.fillna(value=None, method=None, axis=None, inplace=False, limit=None)
df.applymap(FUN)
Series.map(FUN)
Note: Iterates element-wise (for Series) and cell by cell (for DataFrame), passing each value into the function, with the output replacing its input value.
df.pipe(FUN1).pipe(FUN2, arg1=foo).pipe((FUN3, "arg2"), arg1=bar)
# when FUN3's main arg. is not df, supply a (function, string) tuple,
# where the string names the keyword argument that receives df (e.g. "arg2")
NOTE: similar to %>% operation in R.
df.rolling(num_observations).FUN()
Series.rolling(num_observations).FUN()
# Example of counting observations every 7 days past
Series.rolling('7d').count()-1 # subtraction excludes the current day's observation
Note: This is similar to GroupBy then Transform, but function is applied to "rolls" going down a specific column/index.