# Source code for abacus.auto_ab.variance_reduction

from typing import List
import statsmodels.api as sm
from category_encoders.target_encoder import TargetEncoder
from abacus.types import ArrayNumType, ColumnNameType, ColumnNamesType, DataFrameType


class VarianceReduction:
    """Implementation of sensitivity increasing approaches.

    As it is easier to apply variance reduction techniques directly to
    experiment, all approaches should be called on ``ABTest`` class instance.

    Example:

    .. code-block:: python

        import pandas as pd

        from abacus.auto_ab.abtest import ABTest
        from abacus.auto_ab.params import ABTestParams, DataParams, HypothesisParams

        data_params = DataParams(...)
        hypothesis_params = HypothesisParams(...)
        ab_params = ABTestParams(data_params, hypothesis_params)

        df = pd.read_csv('data.csv')
        ab_test = ABTest(df, ab_params)
        ab_test = ab_test.cuped()
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _target_encoding(
        x: DataFrameType, encoding_columns: ColumnNamesType, target_column: str
    ) -> DataFrameType:
        """Target-encode the object-dtype columns among ``encoding_columns``.

        The frame is modified in place: every selected column of dtype
        ``object`` is overwritten with the output of a freshly fitted
        ``TargetEncoder``. Columns of any other dtype are left untouched.

        Args:
            x (pandas.DataFrame): Pandas DataFrame to encode (mutated).
            encoding_columns (List[str]): Candidate columns for encoding.
            target_column (str): Column used as the encoding target.

        Returns:
            pandas.DataFrame: The same DataFrame object that was passed in.
        """
        for col in x[encoding_columns].select_dtypes(include="O").columns:
            # One encoder per column so that fits do not influence each other.
            te = TargetEncoder()
            x[col] = te.fit_transform(x[col], x[target_column])
        return x

    @staticmethod
    def _predict_target(
        x: DataFrameType,
        target_prev_col: ColumnNameType,
        factors_prev_cols: ColumnNamesType,
        factors_now_cols: ColumnNamesType,
    ) -> ArrayNumType:
        """Covariate prediction with linear regression model.

        An OLS model is fitted on the previous-period factors and then applied
        to the current-period factors. The design matrix is used exactly as
        given (this method adds no constant column), and the fit summary is
        printed to stdout.

        Args:
            x (pandas.DataFrame): Pandas DataFrame.
            target_prev_col (str): Target on previous period column name.
            factors_prev_cols (List[str]): Factor columns for modelling.
            factors_now_cols (List[str]): Factor columns for prediction on
                current period. NOTE(review): presumably these must be the
                current-period counterparts of ``factors_prev_cols`` in the
                same order - confirm against callers.

        Returns:
            pandas.Series: Pandas Series with predicted values
        """
        y = x[target_prev_col]
        x_train = x[factors_prev_cols]
        results = sm.OLS(y, x_train).fit()
        print(results.summary())

        x_predict = x[factors_now_cols]
        # Predict through the fitted results object: ``OLS.predict`` on the
        # unfitted model expects the coefficient vector as its first argument,
        # so passing the prediction frame there is wrong.
        return results.predict(x_predict)

    @classmethod
    def cupac(
        cls,
        x: DataFrameType,
        target_prev_col: ColumnNameType,
        target_now_col: ColumnNameType,
        factors_prev_cols: ColumnNamesType,
        factors_now_cols: ColumnNamesType,
        groups_col: ColumnNameType,
    ) -> DataFrameType:
        """Perform CUPED on target variable with covariate calculated as a
        prediction from a linear regression model.

        Original paper: https://doordash.engineering/2020/06/08/improving-experimental-power-through-control-using-predictions-as-covariate-cupac/.

        The input frame is not modified; all work is done on a copy.

        Args:
            x (pandas.DataFrame): Pandas DataFrame for analysis.
            target_prev_col (str): Target on previous period column name.
            target_now_col (str): Target on current period column name.
            factors_prev_cols (List[str]): Factor columns for modelling.
            factors_now_cols (List[str]): Factor columns for prediction on
                current period.
            groups_col (str): Groups column name.

        Returns:
            pandas.DataFrame: Copy of the input with target-encoded factor
            columns, a new ``target_pred`` column holding the predicted
            covariate, and ``target_now_col`` replaced by its CUPED-adjusted
            values.
        """
        # Encoding and the new column would otherwise leak into the caller's
        # frame, unlike ``cuped`` which already works on a copy.
        x = x.copy()

        # Order-preserving de-duplication keeps the column order deterministic.
        encoding_columns = list(dict.fromkeys([*factors_prev_cols, *factors_now_cols]))
        x = cls._target_encoding(x, encoding_columns, target_prev_col)

        x["target_pred"] = cls._predict_target(
            x, target_prev_col, factors_prev_cols, factors_now_cols
        )
        return cls.cuped(x, target_now_col, groups_col, "target_pred")

    @classmethod
    def cuped(
        cls,
        df: DataFrameType,
        target_col: ColumnNameType,
        groups_col: ColumnNameType,
        covariate_col: ColumnNameType,
    ) -> DataFrameType:
        """Perform CUPED on target variable with predefined covariate.

        Covariate has to be chosen with regard to the following restrictions:

        1. Covariate is independent of an experiment.
        2. Covariate is highly correlated with target variable.
        3. Covariate is continuous variable.

        Original paper: https://exp-platform.com/Documents/2013-02-CUPED-ImprovingSensitivityOfControlledExperiments.pdf.

        ``theta`` is estimated once on the whole frame as
        ``cov(target, covariate) / var(covariate)``; the covariate is then
        centred on its own mean within each group before the adjustment is
        subtracted from the target. If the covariate variance is not positive
        (constant covariate, or too few rows to estimate it), ``theta`` is
        taken as 0 and the target is returned unadjusted instead of NaN.

        Args:
            df (pandas.DataFrame): Pandas DataFrame for analysis.
            target_col (str): Target column name.
            groups_col (str): Groups A and B column name.
            covariate_col (str): Covariate column name.

        Returns:
            pandas.DataFrame: Copy of ``df`` in which ``target_col`` holds the
            CUPED-adjusted values. The input frame is left untouched.
        """
        x = df.copy()

        cov = x[[target_col, covariate_col]].cov().loc[target_col, covariate_col]
        var = x[covariate_col].var()
        # ``var > 0`` is False for both zero and NaN variance, which would
        # otherwise turn the whole target column into NaN.
        theta = cov / var if var > 0 else 0.0

        for group in x[groups_col].unique():
            in_group = x[groups_col] == group
            x_subdf = x[in_group]
            group_y_cuped = x_subdf[target_col] - theta * (
                x_subdf[covariate_col] - x_subdf[covariate_col].mean()
            )
            x.loc[in_group, target_col] = group_y_cuped

        return x