# Source code for abacus.auto_ab.variance_reduction

from typing import List
import statsmodels.api as sm
from category_encoders.target_encoder import TargetEncoder
from abacus.types import ArrayNumType, ColumnNameType, ColumnNamesType, DataFrameType


class VarianceReduction:
    """Implementation of sensitivity increasing approaches.

    As it is easier to apply variance reduction techniques directly to an
    experiment, all approaches should be called on an ``ABTest`` class instance.

    Example:

    .. code-block:: python

        import pandas as pd

        from abacus.auto_ab.abtest import ABTest
        from abacus.auto_ab.params import ABTestParams, DataParams, HypothesisParams

        data_params = DataParams(...)
        hypothesis_params = HypothesisParams(...)
        ab_params = ABTestParams(data_params, hypothesis_params)

        df = pd.read_csv('data.csv')
        ab_test = ABTest(df, ab_params)
        ab_test = ab_test.cuped()
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _target_encoding(
        x: DataFrameType, encoding_columns: ColumnNamesType, target_column: str
    ) -> DataFrameType:
        """Target-encode the object-dtype columns among ``encoding_columns``.

        The encoding is written back into ``x`` in place; callers that must
        keep their frame untouched should pass a copy.

        Args:
            x (pandas.DataFrame): Pandas DataFrame holding the columns.
            encoding_columns (List[str]): Candidate columns; only those with
                object dtype are encoded, the rest are left as they are.
            target_column (str): Column used as the target for the encoder.

        Returns:
            pandas.DataFrame: The same DataFrame object with encoded columns.
        """
        for col in x[encoding_columns].select_dtypes(include="O").columns:
            # A fresh encoder per column: each one is fitted independently.
            te = TargetEncoder()
            x[col] = te.fit_transform(x[col], x[target_column])
        return x

    @staticmethod
    def _predict_target(
        x: DataFrameType,
        target_prev_col: ColumnNameType,
        factors_prev_cols: ColumnNamesType,
        factors_now_cols: ColumnNamesType,
    ) -> ArrayNumType:
        """Covariate prediction with linear regression model.

        An OLS model is fitted on the previous-period factors and then applied
        to the current-period factors. The two factor lists are matched by
        position, so they must have the same length and order. No intercept
        column is added here.

        Args:
            x (pandas.DataFrame): Pandas DataFrame.
            target_prev_col (str): Target on previous period column name.
            factors_prev_cols (List[str]): Factor columns for modelling.
            factors_now_cols (List[str]): Factor columns for prediction on current period.

        Returns:
            pandas.Series: Predicted values, one per row of ``x``.
        """
        y = x[target_prev_col]
        x_train = x[factors_prev_cols]
        model = sm.OLS(y, x_train)
        results = model.fit()

        print(results.summary())
        x_predict = x[factors_now_cols]

        # Predict through the fitted results object: the unfitted model's
        # ``predict`` takes the coefficient vector as its first argument, so
        # passing the design matrix there would be misread as parameters.
        return results.predict(x_predict)

    @classmethod
    def cupac(
        cls,
        x: DataFrameType,
        target_prev_col: ColumnNameType,
        target_now_col: ColumnNameType,
        factors_prev_cols: ColumnNamesType,
        factors_now_cols: ColumnNamesType,
        groups_col: ColumnNameType,
    ) -> DataFrameType:
        """Perform CUPED on target variable with covariate calculated
        as a prediction from a linear regression model.

        Original paper: https://doordash.engineering/2020/06/08/improving-experimental-power-through-control-using-predictions-as-covariate-cupac/.

        The input DataFrame is not modified; all work happens on a copy.

        Args:
            x (pandas.DataFrame): Pandas DataFrame for analysis.
            target_prev_col (str): Target on previous period column name.
            target_now_col (str): Target on current period column name.
            factors_prev_cols (List[str]): Factor columns for modelling.
            factors_now_cols (List[str]): Factor columns for prediction on current period.
            groups_col (str): Groups column name.

        Returns:
            pandas.DataFrame: Copy of ``x`` with object-dtype factor columns
            target-encoded, an additional ``target_pred`` column, and
            ``target_now_col`` replaced by its CUPED-adjusted values.
        """
        # Copy first so encoding and the new column never leak into the
        # caller's DataFrame (consistent with ``cuped``).
        x = x.copy()

        # De-duplicate while keeping the original column order.
        encoding_columns = list(
            dict.fromkeys([*factors_prev_cols, *factors_now_cols])
        )
        x = cls._target_encoding(x, encoding_columns, target_prev_col)
        x["target_pred"] = cls._predict_target(
            x, target_prev_col, factors_prev_cols, factors_now_cols
        )
        return cls.cuped(x, target_now_col, groups_col, "target_pred")

    @classmethod
    def cuped(
        cls,
        df: DataFrameType,
        target_col: ColumnNameType,
        groups_col: ColumnNameType,
        covariate_col: ColumnNameType,
    ) -> DataFrameType:
        """Perform CUPED on target variable with predefined covariate.

        Covariate has to be chosen with regard to the following restrictions:

        1. Covariate is independent of an experiment.
        2. Covariate is highly correlated with target variable.
        3. Covariate is continuous variable.

        Original paper: https://exp-platform.com/Documents/2013-02-CUPED-ImprovingSensitivityOfControlledExperiments.pdf.

        ``theta`` is estimated once on the pooled data, while the covariate is
        centred separately within each group.

        Args:
            df (pandas.DataFrame): Pandas DataFrame for analysis.
            target_col (str): Target column name.
            groups_col (str): Groups A and B column name.
            covariate_col (str): Covariate column name. Must be provided.

        Returns:
            pandas.DataFrame: Copy of ``df`` in which ``target_col`` holds the
            CUPED-adjusted values. If the covariate has zero (or undefined)
            variance no adjustment is possible and the target is left as is.
        """
        x = df.copy()

        cov = x[[target_col, covariate_col]].cov().loc[target_col, covariate_col]
        var = x[covariate_col].var()
        # A constant covariate (var == 0) or a single row (var is NaN) would
        # make theta NaN and wipe out the target; fall back to no adjustment.
        theta = cov / var if var > 0 else 0.0

        for group in x[groups_col].unique():
            in_group = x[groups_col] == group
            covariate = x.loc[in_group, covariate_col]
            x.loc[in_group, target_col] = x.loc[in_group, target_col] - theta * (
                covariate - covariate.mean()
            )

        return x