Source code for abacus.auto_ab.abtest

from __future__ import annotations
from typing import Optional, Tuple, Dict, Any
import copy
import warnings
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from scipy.stats import mannwhitneyu, ttest_ind, shapiro, mode, t, chisquare, norm
from statsmodels.stats.proportion import proportions_ztest
from abacus.auto_ab.graphics import Graphics
from abacus.auto_ab.variance_reduction import VarianceReduction
from abacus.auto_ab.params import ABTestParams
from abacus.resplitter.resplit_builder import ResplitBuilder
from abacus.resplitter.params import ResplitParams
from abacus.types import ArrayNumType, DataFrameType, StatTestResultType

# Emit every warning each time it is triggered rather than once per location.
# NOTE: this runs at import time and changes the process-wide warning filter.
warnings.simplefilter("always")


class ABTest:
    """Performs different calculations of A/B-test.

    - Results evaluation for different metric types (continuous, binary, ratio).
    - Bucketing (decrease number of points, normal distribution of metric of interest)

    Transformation methods (``bucketing``, ``cuped``, ``linearization``, ...) do not
    modify the instance they are called on; they return a new ``ABTest``.

    Example:

    .. code-block:: python

        from abacus.auto_ab.abtest import ABTest
        from abacus.auto_ab.params import ABTestParams, DataParams, HypothesisParams

        data_params = DataParams(...)
        hypothesis_params = HypothesisParams(...)
        ab_params = ABTestParams(data_params, hypothesis_params)

        df = pd.read_csv('data.csv')
        ab_test = ABTest(df, ab_params)
        ab_test.test_welch()
        # {'stat': 5.172, 'p-value': 0.312, 'result': 0}
    """

    def __init__(self, dataset: Optional[DataFrameType], params: ABTestParams) -> None:
        """Store the dataset and parameters and extract the groups when possible.

        Control and treatment arrays are read from ``dataset`` only for a
        continuous/binary metric with no transformations applied yet; otherwise
        the arrays already stored in ``params`` are kept as they are.

        Args:
            dataset (pandas.DataFrame, optional): Experiment data, or None when the
                instance works on pre-computed control/treatment arrays only.
            params (ABTestParams): Data and hypothesis parameters of the experiment.

        Raises:
            ValueError: If required columns are absent from ``dataset``.
        """
        self.params = params
        self.__dataset = dataset

        if (
            dataset is not None
            and len(self.params.data_params.transforms) == 0
            and self.params.hypothesis_params.metric_type in ("continuous", "binary")
        ):
            self.__check_required_columns(dataset, "init")
            self.params.data_params.control = self.__get_group(
                self.params.data_params.control_name, self.dataset
            )
            self.params.data_params.treatment = self.__get_group(
                self.params.data_params.treatment_name, self.dataset
            )

    @property
    def dataset(self) -> DataFrameType:
        """Dataset the instance was created with (may be None)."""
        return self.__dataset

    def __str__(self) -> str:
        hypothesis = self.params.hypothesis_params
        return (
            f"ABTest(alpha={hypothesis.alpha}, beta={hypothesis.beta}, "
            f"alternative={hypothesis.alternative}, "
            f"metric type={hypothesis.metric_type}, "
            f"metric_name={hypothesis.metric_name})"
        )

    def __check_applied_transformation(self, method: str) -> None:
        """Warn when transformation ``method`` has already been applied.

        Applied transformations are stored with human-readable labels
        ("filter outliers", "metric transform") while methods are named with
        underscores, so both sides are normalised before comparison.

        Args:
            method (str): Name of the transformation method being called.
        """
        applied = {
            str(label).replace(" ", "_")
            for label in self.params.data_params.transforms
        }
        if method.replace(" ", "_") in applied:
            warnings.warn(
                f"Method `{method}` has already been called before. "
                f"It will be applied again, but you should check whether it is needed twice."
            )

    def __check_required_metric_type(self, method: str) -> None:
        """Check that ``method`` is available for the current metric type.

        Args:
            method (str): Name of the method being called.

        Raises:
            ValueError: If ``method`` does not support the current metric type.
        """
        available_metric_methods = {
            "continuous": [
                "report_continuous",
                "cuped",
                "cupac",
                "bucketing",
                "filter_outliers",
                "metric_transform",
                "test_boot_fp",
                "test_boot_welch",
                "test_boot_confint",
                "test_welch",
                "test_mannwhitney",
                "test_buckets",
                "linearization",
            ],
            "binary": [
                "report_binary",
                "test_boot_confint",
                "test_boot_fp",
                "test_z_proportions",
                "test_chisquare",
            ],
            "ratio": [
                "report_ratio",
                "linearization",
                "manual_ttest",
                "test_boot_ratio",
                "test_delta_ratio",
                "test_taylor_ratio",
            ],
        }
        current_metric_type = self.params.hypothesis_params.metric_type
        if method in available_metric_methods[current_metric_type]:
            return

        # Report the first metric type (in declaration order) that supports the method.
        required_metric_type = next(
            (
                metric_type
                for metric_type, methods in available_metric_methods.items()
                if method in methods
            ),
            "",
        )
        raise ValueError(
            "Incorrect metric type: '{incorrect_metric_type}' required, but '{current_metric_type}' provided".format(
                incorrect_metric_type=required_metric_type,
                current_metric_type=current_metric_type,
            )
        )

    def __check_required_columns(self, df: DataFrameType, method: str) -> None:
        """Check presence of columns in dataframe.

        A data attribute may hold a single column name or a collection of
        column names; in the latter case every name has to be present.

        Args:
            df (pandas.DataFrame): DataFrame to check.
            method (str): Stage of A/B process which you'd like to test.

        Raises:
            ValueError: If a required attribute is empty or refers to a column
                absent from ``df``. Experiment cannot be provided if required
                columns are absent.
        """
        data = self.params.data_params
        cols: Dict[str, Any] = {}
        if method == "init":
            cols = {"id_col": data.id_col, "group_col": data.group_col}
            if self.params.hypothesis_params.metric_type in ["continuous", "binary"]:
                cols["target"] = data.target
            elif self.params.hypothesis_params.metric_type == "ratio":
                cols["numerator"] = data.numerator
                cols["denominator"] = data.denominator
        elif method == "cuped":
            cols = {"covariate": data.covariate}
        elif method == "cupac":
            cols = {
                "predictors_prev": data.predictors_prev,
                "predictors_now": data.predictors_now,
                "target_prev": data.target_prev,
            }
        elif method == "resplit_df":
            cols = {"strata_col": data.strata_col}

        df_cols = df.columns
        not_correct_fields = []
        for field, value in cols.items():
            if isinstance(value, (list, tuple, set, np.ndarray)):
                names = list(value)
            else:
                names = [value]
            if not names or any(name == "" or name not in df_cols for name in names):
                not_correct_fields.append(field)

        if len(not_correct_fields) > 0:
            raise ValueError(
                f"You did not provide or provide incorrectly following data attributes: {not_correct_fields}"
            )

    def __get_group(
        self, group_label: str, df: Optional[DataFrameType] = None
    ) -> np.ndarray:
        """Gets target metric column based on desired group label.

        Args:
            group_label (str): Group label, e.g. 'A', 'B'.
            df (DataFrameType, optional): DataFrame to query from.
                Defaults to the dataset of the instance.

        Returns:
            numpy.ndarray: Target column for a desired group
            (empty array for a ratio metric).
        """
        x = df if df is not None else self.__dataset
        group = np.array([])
        if self.params.hypothesis_params.metric_type in ["continuous", "binary"]:
            group = x.loc[
                x[self.params.data_params.group_col] == group_label,
                self.params.data_params.target,
            ].to_numpy()
        return group

    def __bucketize(self, x: ArrayNumType) -> np.ndarray:
        """Split array ``x`` into N non-overlapping buckets.

        There are two purposes for these actions:
        1. Decrease number of data points of experiment.
        2. Get normal distribution of a metric of interest.

        Procedure:
        1. Shuffle elements of an array.
        2. Split points into N non-overlapping buckets.
        3. On every bucket calculate metric of interest.

        Args:
            x (np.ndarray): Array to split. It is not modified.

        Returns:
            np.ndarray: Splitted into buckets array.
        """
        # permutation() shuffles a copy, so the caller's array keeps its order.
        shuffled = np.random.permutation(x)
        metric = self.params.hypothesis_params.metric
        buckets = np.array_split(shuffled, self.params.hypothesis_params.n_buckets)
        return np.array([metric(bucket) for bucket in buckets])

    def __manual_ttest(
        self,
        ctrl_mean: float,
        ctrl_var: float,
        ctrl_size: int,
        treat_mean: float,
        treat_var: float,
        treat_size: int,
    ) -> StatTestResultType:
        """Performs Welch's t-test based on aggregation of metrics instead of datasets.

        For empirical calculation of T-statistic we need: expectation, variance,
        array size for each group. The statistic is standardised, therefore it
        is compared with the standard t distribution (Welch-Satterthwaite
        degrees of freedom).

        Args:
            ctrl_mean (float): Mean of control group.
            ctrl_var (float): Variance of control group.
            ctrl_size (int): Size of control group.
            treat_mean (float): Mean of treatment group.
            treat_var (float): Variance of treatment group.
            treat_size (int): Size of treatment group.

        Returns:
            stat_test_typing: Dictionary with following properties: test statistic, p-value, test result.
            Test result: 1 - significant different, 0 - insignificant difference.
        """
        self.__check_required_metric_type("manual_ttest")
        alpha = self.params.hypothesis_params.alpha
        alternative = self.params.hypothesis_params.alternative

        ctrl_se2 = ctrl_var / ctrl_size
        treat_se2 = treat_var / treat_size
        t_stat_empirical = (treat_mean - ctrl_mean) / (ctrl_se2 + treat_se2) ** (1 / 2)
        df = (ctrl_se2 + treat_se2) ** 2 / (
            ctrl_var**2 / (ctrl_size**2 * (ctrl_size - 1))
            + treat_var**2 / (treat_size**2 * (treat_size - 1))
        )

        pvalue = None
        if alternative == "two-sided":
            pvalue = 2 * t.sf(abs(t_stat_empirical), df=df)
        elif alternative == "less":
            pvalue = t.cdf(t_stat_empirical, df=df)
        elif alternative == "greater":
            pvalue = t.sf(t_stat_empirical, df=df)

        test_result: int = 0
        if pvalue is not None and pvalue <= alpha:
            test_result = 1

        result = {
            "stat": np.round(t_stat_empirical, 5),
            "p-value": None if pvalue is None else np.round(pvalue, 5),
            "result": test_result,
        }
        return result

    def __delta_params(self, x: DataFrameType) -> Tuple[float, float]:
        """Calculated expectation and variance for ratio metric using delta approximation.

        The ratio is ``numerator / denominator``. The returned variance is the
        per-observation (linearised) variance; the caller divides it by the
        group size.

        Source: https://arxiv.org/pdf/1803.06336.pdf.

        Args:
            x (pandas.DataFrame): Pandas DataFrame of particular group (A, B, etc).

        Returns:
            Tuple[float, float]: Mean and variance of ratio metric.
        """
        num_col = self.params.data_params.numerator
        den_col = self.params.data_params.denominator
        num, den = x[num_col], x[den_col]
        num_mean, den_mean = num.mean(), den.mean()
        num_var, den_var = num.var(), den.var()
        cov = x[[num_col, den_col]].cov().iloc[0, 1]
        n = len(num)

        bias_correction = (num_mean / den_mean**3) * (den_var / n) - cov / (
            n * den_mean**2
        )
        mean = num_mean / den_mean + bias_correction
        var = (
            num_var / den_mean**2
            - 2 * (num_mean / den_mean**3) * cov
            + (num_mean**2 / den_mean**4) * den_var
        )
        return mean, var

    def __taylor_params(self, x: DataFrameType) -> Tuple[float, float]:
        """Calculated expectation and variance for ratio metric using Taylor expansion approximation.

        Source: https://www.stat.cmu.edu/~hseltman/files/ratio.pdf.

        Args:
            x (pandas.DataFrame): Pandas DataFrame of particular group (A, B, etc).

        Returns:
            Tuple[float, float]: Mean and variance of ratio metric.
        """
        num_col = self.params.data_params.numerator
        den_col = self.params.data_params.denominator
        num_mean, den_mean = x[num_col].mean(), x[den_col].mean()
        num_var, den_var = x[num_col].var(), x[den_col].var()
        cov = x[[num_col, den_col]].cov().iloc[0, 1]

        mean = num_mean / den_mean - cov / den_mean**2 + den_var * num_mean / den_mean**3
        # Var(R/S) ~ (mu_R/mu_S)^2 * [var_R/mu_R^2 - 2*cov/(mu_R*mu_S) + var_S/mu_S^2]
        var = (num_mean**2 / den_mean**2) * (
            num_var / num_mean**2
            - 2 * cov / (num_mean * den_mean)
            + den_var / den_mean**2
        )
        return mean, var

    def __report_binary(self) -> Tuple[str, Dict[str, Any]]:
        """Build the text report and its parameters for a binary metric.

        Returns:
            Tuple[str, Dict[str, Any]]: Formatted report and the values used in it.
        """
        self.__check_required_metric_type("report_binary")

        hypothesis = self.params.hypothesis_params
        ctrl = self.params.data_params.control
        trtm = self.params.data_params.treatment

        ztest = self.test_z_proportions()
        ztest_res = "H0 is not rejected" if ztest["result"] == 0 else "H0 is rejected"

        try:
            chisq = self.test_chisquare()
        except ValueError:
            # test_chisquare raises ValueError when the groups differ in size;
            # the report then relies on the Z-test only.
            chisq_result = ""
            test_result = ztest["result"]
            num_of_tests = 1
        else:
            chisq_res = (
                "H0 is not rejected" if chisq["result"] == 0 else "H0 is rejected"
            )
            chisq_result = (
                f"- Chi-square test: {chisq['stat']:.2f}, "
                f"p-value = {chisq['p-value']:.4f}, {chisq_res}."
            )
            test_result = chisq["result"] + ztest["result"]
            num_of_tests = 2

        test_explanation = (
            f"{test_result} out of {num_of_tests} stat.test show that H0 is rejected."
        )

        transforms: ArrayNumType = self.params.data_params.transforms
        if len(transforms) > 0:
            transforms_str = "Transformations applied: " + " -> ".join(transforms) + "."
        else:
            transforms_str = "No transformations applied."

        params = {
            "ztest_stat": ztest["stat"],
            "ztest_pvalue": ztest["p-value"],
            "ztest_result": ztest_res,
            "chi_square": chisq_result,
            "ctrl_conv": sum(ctrl) / len(ctrl),
            "trtm_conv": sum(trtm) / len(trtm),
            "ctrl_obs": len(ctrl),
            "trtm_obs": len(trtm),
            "alpha": hypothesis.alpha,
            "beta": hypothesis.beta,
            "alternative": hypothesis.alternative,
            "metric_name": hypothesis.metric_name,
            "transforms": transforms_str,
            "test_explanation": test_explanation,
        }

        template = "\n".join(
            [
                "",
                "Parameters of experiment:",
                "- Metric type: binary.",
                "- Metric: {metric_name}.",
                "- Errors: alpha = {alpha}, beta = {beta}.",
                "- Alternative: {alternative}.",
                "",
                "{transforms}",
                "",
                "Following statistical tests are used:",
                "- Z-test: {ztest_stat:.2f}, p-value = {ztest_pvalue:.4f}, {ztest_result}.",
                "{chi_square}",
                "",
                "{test_explanation}",
                "",
                "Please note that you should carefully use the results of different "
                "statistical procedures and do not consider all of them at once.",
                "",
                "Statistics of experiment groups.",
                "",
                "Control group:",
                "- Observations: {ctrl_obs}",
                "- Conversion: {ctrl_conv}",
                "",
                "Treatment group:",
                "- Observations: {trtm_obs}",
                "- Conversion: {trtm_conv}",
                "",
            ]
        )
        output = template.format(**params)
        return output, params

    def __report_continuous(self) -> Tuple[str, Dict[str, Any]]:
        """Build the text report and its parameters for a continuous metric.

        Returns:
            Tuple[str, Dict[str, Any]]: Formatted report and the values used in it.
        """
        self.__check_required_metric_type("report_continuous")

        hypothesis = self.params.hypothesis_params
        ctrl = self.params.data_params.control
        trtm = self.params.data_params.treatment

        welch = self.test_welch()
        welch_res = "H0 is not rejected" if welch["result"] == 0 else "H0 is rejected"

        mwu = self.test_mannwhitney()
        mwu_res = "H0 is not rejected" if mwu["result"] == 0 else "H0 is rejected"

        boot = self.test_boot_confint()
        boot_res = "H0 is not rejected" if boot["result"] == 0 else "H0 is rejected"

        test_result = welch["result"] + mwu["result"] + boot["result"]
        test_explanation = {
            3: "All three stat. tests showed that H0 is rejected.",
            2: "Two out of three stat. tests showed that H0 is rejected.",
            1: "Two out of three stat. tests showed that H0 is not rejected.",
            0: "All three stat. tests showed that H0 is not rejected.",
        }.get(test_result, "")

        transforms: ArrayNumType = self.params.data_params.transforms

        bucketing_str = ""
        if "bucketing" in transforms:
            bucketing_str = f"Number of buckets: {hypothesis.n_buckets}.\n"

        metric_transform_str = ""
        if "metric transform" in transforms:
            metric_transform_str = f"Metric transformation applied: {hypothesis.metric_transform.__name__}.\n"

        filter_outliers_str = ""
        if "filter outliers" in transforms:
            filter_outliers_str = (
                f"Outliers filtering method applied: {hypothesis.filter_method}.\n"
            )

        if len(transforms) > 0:
            transforms_str = (
                "Transformations applied: " + " -> ".join(transforms) + ".\n"
            )
        else:
            transforms_str = "No transformations applied.\n"

        params = {
            "welch_stat": welch["stat"],
            "welch_pvalue": welch["p-value"],
            "welch_result": welch_res,
            "mwu_stat": mwu["stat"],
            "mwu_pvalue": mwu["p-value"],
            "mwu_result": mwu_res,
            "boot_result": boot_res,
            "ctrl_obs": len(ctrl),
            "trtm_obs": len(trtm),
            "ctrl_mean": np.mean(ctrl),
            "ctrl_median": np.median(ctrl),
            "ctrl_25th": np.quantile(ctrl, 0.25),
            "ctrl_75th": np.quantile(ctrl, 0.75),
            "ctrl_min": np.min(ctrl),
            "ctrl_max": np.max(ctrl),
            "ctrl_std": np.std(ctrl),
            "ctrl_var": np.var(ctrl),
            "trtm_mean": np.mean(trtm),
            "trtm_median": np.median(trtm),
            "trtm_25th": np.quantile(trtm, 0.25),
            "trtm_75th": np.quantile(trtm, 0.75),
            "trtm_min": np.min(trtm),
            "trtm_max": np.max(trtm),
            "trtm_std": np.std(trtm),
            "trtm_var": np.var(trtm),
            "alpha": hypothesis.alpha,
            "beta": hypothesis.beta,
            "alternative": hypothesis.alternative,
            "metric_name": hypothesis.metric_name,
            "bucketing_str": bucketing_str,
            "transforms": transforms_str,
            "metric_transform_str": metric_transform_str,
            "filter_outliers_str": filter_outliers_str,
            "n_boot_samples": hypothesis.n_boot_samples,
            "test_explanation": test_explanation,
        }

        template = "\n".join(
            [
                "",
                "Parameters of experiment:",
                "- Metric type: continuous.",
                "- Metric: {metric_name}.",
                "- Errors: alpha = {alpha}, beta = {beta}.",
                "- Alternative: {alternative}.",
                "",
                # The optional fragments below carry their own trailing newline.
                "{transforms}Number of bootstrap iterations: {n_boot_samples}.\n"
                "{bucketing_str}{metric_transform_str}{filter_outliers_str}",
                "Following statistical tests are used:",
                "- Welch's t-test: {welch_stat:.2f}, p-value = {welch_pvalue:.4f}, {welch_result}.",
                "- Mann Whitney's U-test: {mwu_stat:.2f}, p-value = {mwu_pvalue:.4f}, {mwu_result}.",
                "- Bootstrap test: {boot_result}.",
                "",
                "{test_explanation}",
                "",
                "Please note that you should carefully use the results of different "
                "statistical procedures and do not consider all of them at once.",
                "",
                "Statistics of experiment groups.",
                "",
                "Control group:",
                "- Observations: {ctrl_obs}",
                "- Mean: {ctrl_mean:.4f}",
                "- Median: {ctrl_median:.4f}",
                "- 25th quantile: {ctrl_25th:.4f}",
                "- 75th quantile: {ctrl_75th:.4f}",
                "- Minimum: {ctrl_min:.4f}",
                "- Maximum: {ctrl_max:.4f}",
                "- St.deviation: {ctrl_std:.4f}",
                "- Variance: {ctrl_var:.4f}",
                "",
                "Treatment group:",
                "- Observations: {trtm_obs}",
                "- Mean: {trtm_mean:.4f}",
                "- Median: {trtm_median:.4f}",
                "- 25th quantile: {trtm_25th:.4f}",
                "- 75th quantile: {trtm_75th:.4f}",
                "- Minimum: {trtm_min:.4f}",
                "- Maximum: {trtm_max:.4f}",
                "- St.deviation: {trtm_std:.4f}",
                "- Variance: {trtm_var:.4f}",
                "",
            ]
        )
        output = template.format(**params)
        return output, params

    def __report_ratio(self):
        """Report for a ratio metric (not implemented yet)."""
        raise NotImplementedError("Reporting for ratio metric is still in progress..")

    def bucketing(self) -> ABTest:
        """Performs bucketing in order to accelerate results computation.

        Returns:
            ABTest: New instance of ``ABTest`` class with modified control and treatment.
        """
        self.__check_applied_transformation("bucketing")
        self.__check_required_metric_type("bucketing")

        params_new = copy.deepcopy(self.params)
        params_new.data_params.control = self.__bucketize(
            self.params.data_params.control
        )
        params_new.data_params.treatment = self.__bucketize(
            self.params.data_params.treatment
        )
        params_new.data_params.transforms = np.append(
            params_new.data_params.transforms, "bucketing"
        )
        # Bucketed values no longer correspond to dataset rows, hence no dataset.
        return ABTest(None, params_new)

    def cuped(self) -> ABTest:
        """Performs CUPED for variance reduction.

        Returns:
            ABTest: New instance of ``ABTest`` class with modified control and treatment.
        """
        self.__check_applied_transformation("cuped")
        self.__check_required_metric_type("cuped")
        self.__check_required_columns(self.__dataset, "cuped")

        result_df = VarianceReduction.cuped(
            self.__dataset,
            target_col=self.params.data_params.target,
            groups_col=self.params.data_params.group_col,
            covariate_col=self.params.data_params.covariate,
        )

        params_new = copy.deepcopy(self.params)
        params_new.data_params.control = self.__get_group(
            self.params.data_params.control_name, result_df
        )
        params_new.data_params.treatment = self.__get_group(
            self.params.data_params.treatment_name, result_df
        )
        params_new.data_params.transforms = np.append(
            params_new.data_params.transforms, "cuped"
        )
        return ABTest(result_df, params_new)

    def cupac(self) -> ABTest:
        """Performs CUPAC for variance reduction.

        Returns:
            ABTest: New instance of ``ABTest`` class with modified control and treatment.
        """
        self.__check_applied_transformation("cupac")
        self.__check_required_metric_type("cupac")
        self.__check_required_columns(self.__dataset, "cupac")

        result_df = VarianceReduction.cupac(
            self.__dataset,
            target_prev_col=self.params.data_params.target_prev,
            target_now_col=self.params.data_params.target,
            factors_prev_cols=self.params.data_params.predictors_prev,
            factors_now_cols=self.params.data_params.predictors_now,
            groups_col=self.params.data_params.group_col,
        )

        params_new = copy.deepcopy(self.params)
        params_new.data_params.control = self.__get_group(
            self.params.data_params.control_name, result_df
        )
        params_new.data_params.treatment = self.__get_group(
            self.params.data_params.treatment_name, result_df
        )
        params_new.data_params.transforms = np.append(
            params_new.data_params.transforms, "cupac"
        )
        return ABTest(result_df, params_new)

    def filter_outliers(self) -> ABTest:
        """Removes outliers from the dataset.

        Supported ``filter_method`` values: ``isolation_forest`` (keeps the rows
        labelled as inliers by ``IsolationForest``) and ``top_5`` (keeps the rows
        whose target does not exceed the 0.95 quantile). Any other value leaves
        the data unchanged.

        Returns:
            ABTest: New instance of ``ABTest`` class with modified control and treatment.
        """
        self.__check_applied_transformation("filter_outliers")
        self.__check_required_metric_type("filter_outliers")

        filter_method = self.params.hypothesis_params.filter_method
        target_col = self.params.data_params.target
        target = self.__dataset[[target_col]].values

        dataset_new = self.__dataset.copy()
        if filter_method == "isolation_forest":
            not_outlier_index = IsolationForest(random_state=0).fit_predict(target) == 1
            dataset_new = self.__dataset.loc[not_outlier_index].reset_index(drop=True)
        elif filter_method == "top_5":
            quantile95 = np.quantile(target, 0.95)
            not_outlier_index = self.__dataset[target_col] <= quantile95
            dataset_new = self.__dataset.loc[not_outlier_index].reset_index(drop=True)

        params_new = copy.deepcopy(self.params)
        params_new.data_params.transforms = np.append(
            params_new.data_params.transforms, "filter outliers"
        )
        params_new.data_params.control = self.__get_group(
            self.params.data_params.control_name, dataset_new
        )
        params_new.data_params.treatment = self.__get_group(
            self.params.data_params.treatment_name, dataset_new
        )
        return ABTest(dataset_new, params_new)

    def linearization(self) -> ABTest:
        """Creates linearized continuous metric based on ratio-metric.

        Important: there is an assumption that all data is already grouped by user
        s.t. numerator for user = sum of numerators for user for different time periods
        and denominator for user = sum of denominators for user for different time periods

        Source: https://research.yandex.com/publications/148.

        Returns:
            ABTest: New instance of ``ABTest`` class with a continuous
            ``target_linearized`` metric. The current instance is not modified.
        """
        self.__check_applied_transformation("linearization")
        self.__check_required_metric_type("linearization")

        data = self.params.data_params
        if data.is_grouped:
            # NOTE(review): grouped data is returned without being linearized;
            # this looks inverted (grouping could be skipped, linearization not) - confirm.
            return ABTest(self.__dataset, self.params)

        dataset_new = copy.deepcopy(self.__dataset)
        params_new = copy.deepcopy(self.params)
        id_col, group_col = data.id_col, data.group_col
        num_col, den_col = "num", "den"

        # Per-user totals are kept in a local frame: the dataset of this
        # instance must stay untouched.
        grouped = None
        if self.params.hypothesis_params.metric_type == "ratio":
            grouped = (
                self.__dataset.groupby(by=[id_col, group_col])
                .agg({data.numerator: "sum", data.denominator: "sum"})
                .rename(columns={data.numerator: num_col, data.denominator: den_col})
                .reset_index()
            )
        elif self.params.hypothesis_params.metric_type == "continuous":
            grouped = (
                self.__dataset.groupby(by=[id_col, group_col])[data.target]
                .agg(["sum", "count"])
                .rename(columns={"sum": num_col, "count": den_col})
                .reset_index()
            )

        ctrl = grouped.loc[grouped[group_col] == data.control_name]
        k = round(sum(ctrl[num_col]) / sum(ctrl[den_col]), 5)

        new_target_name = "target_linearized"
        grouped[new_target_name] = grouped[num_col] - k * grouped[den_col]

        dataset_new = dataset_new.merge(
            grouped[[id_col, new_target_name]], how="left", on=id_col
        )
        dataset_new = dataset_new.drop_duplicates(subset=[id_col])

        params_new.data_params.target = new_target_name
        params_new.data_params.control = dataset_new.loc[
            dataset_new[group_col] == data.control_name, new_target_name
        ].to_numpy()
        params_new.data_params.treatment = dataset_new.loc[
            dataset_new[group_col] == data.treatment_name, new_target_name
        ].to_numpy()
        params_new.data_params.transforms = np.append(
            params_new.data_params.transforms, "linearization"
        )
        params_new.hypothesis_params.metric_type = "continuous"
        return ABTest(dataset_new, params_new)

    def metric_transform(self) -> ABTest:
        """Applies ``hypothesis_params.metric_transform`` to the target of both groups.

        Returns:
            ABTest: New instance of ``ABTest`` class with transformed target,
            control and treatment. When no transformation is configured, an
            instance over the same dataset and parameters is returned.
        """
        self.__check_applied_transformation("metric_transform")
        self.__check_required_metric_type("metric_transform")

        transform = self.params.hypothesis_params.metric_transform
        if transform is None:
            return ABTest(self.__dataset, self.params)

        dataset_new = copy.deepcopy(self.__dataset)
        target = self.params.data_params.target
        group_col = self.params.data_params.group_col

        control_flg = dataset_new[group_col] == self.params.data_params.control_name
        treatment_flg = dataset_new[group_col] == self.params.data_params.treatment_name
        dataset_new.loc[control_flg, target] = transform(
            dataset_new.loc[control_flg, target].to_numpy()
        )
        dataset_new.loc[treatment_flg, target] = transform(
            dataset_new.loc[treatment_flg, target].to_numpy()
        )

        params_new = copy.deepcopy(self.params)
        params_new.data_params.transforms = np.append(
            params_new.data_params.transforms, "metric transform"
        )
        # The dataset already holds transformed values; read them as they are
        # instead of transforming a second time.
        params_new.data_params.control = dataset_new.loc[control_flg, target].to_numpy()
        params_new.data_params.treatment = dataset_new.loc[
            treatment_flg, target
        ].to_numpy()
        return ABTest(dataset_new, params_new)

    def plot(self, kind: str = "experiment", save_path: Optional[str] = None) -> None:
        """Plot experiment.

        Args:
            kind (str): Kind of plot: 'experiment', 'bootstrap'.
            save_path (str, optional): Path where to save image.

        Raises:
            ValueError: If `kind` is not in ['experiment', 'bootstrap'].
        """
        if kind not in ["experiment", "bootstrap"]:
            raise ValueError(
                "`kind` parameter supports only the following values: 'experiment', 'bootstrap'"
            )

        metric_type = self.params.hypothesis_params.metric_type
        if kind == "experiment":
            if metric_type == "continuous":
                Graphics.plot_continuous_experiment(self.params, save_path)
            if metric_type == "binary":
                Graphics.plot_binary_experiment(self.params, save_path)
        elif kind == "bootstrap" and metric_type in ["continuous", "binary"]:
            Graphics.plot_bootstrap_confint(self.params, save_path)

    def report(self) -> Dict[str, Any]:
        """Prints the report of the experiment.

        Returns:
            Dict[str, Any]: Values used in the report (empty for a ratio metric).
        """
        report_output = "Report for ratio metric currently not supported."
        report_params = {}

        if self.params.hypothesis_params.metric_type == "continuous":
            report_output, report_params = self.__report_continuous()

        if self.params.hypothesis_params.metric_type == "binary":
            report_output, report_params = self.__report_binary()

        print(report_output)
        return report_params

    def resplit_df(self) -> ABTest:
        """Resplit dataframe.

        Returns:
            ABTest: Instance of ``ABTest`` class with modified control and treatment.
        """
        resplit_params = ResplitParams(
            group_col=self.params.data_params.group_col,
            strata_col=self.params.data_params.strata_col,
        )
        resplitter = ResplitBuilder(self.__dataset, resplit_params)
        new_dataset = resplitter.collect()
        # The constructor may overwrite control/treatment in the params it
        # receives, so it gets its own copy.
        return ABTest(new_dataset, copy.deepcopy(self.params))

    def test_boot_fp(self) -> StatTestResultType:
        """Performs bootstrap hypothesis testing by calculation of false positives.

        The p-value is the two-sided percentile bootstrap p-value: twice the
        smaller share of bootstrapped metric differences (treatment - control)
        lying on either side of zero, capped at 1.

        Returns:
            stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``.
            Test result: 1 - significant different, 0 - insignificant difference.
        """
        self.__check_required_metric_type("test_boot_fp")

        x = self.params.data_params.control
        y = self.params.data_params.treatment
        metric = self.params.hypothesis_params.metric

        metric_diffs = []
        for _ in range(self.params.hypothesis_params.n_boot_samples):
            x_boot = np.random.choice(x, size=x.shape[0], replace=True)
            y_boot = np.random.choice(y, size=y.shape[0], replace=True)
            metric_diffs.append(metric(y_boot) - metric(x_boot))
        diffs = np.asarray(metric_diffs, dtype=float)

        share_non_positive = np.mean(diffs <= 0)
        share_non_negative = np.mean(diffs >= 0)
        false_positive = min(1.0, 2 * min(share_non_positive, share_non_negative))

        test_result: int = 0  # 0 - cannot reject H0, 1 - reject H0
        if false_positive <= self.params.hypothesis_params.alpha:
            test_result = 1

        result = {
            "stat": None,
            "p-value": np.round(false_positive, 5),
            "result": test_result,
        }
        return result

    def test_boot_confint(self) -> StatTestResultType:
        """Performs bootstrap confidence interval and zero statistical significance.

        Differences are computed as ``metric(treatment) - metric(control)``:

        - ``two-sided``: H0 is rejected if zero is outside the
          [alpha/2, 1 - alpha/2] percentile interval.
        - ``less``: H0 is rejected if the (1 - alpha) quantile is below zero.
        - ``greater``: H0 is rejected if the alpha quantile is above zero.

        The p-value comes from a normal approximation of the bootstrap
        distribution of differences.

        Returns:
            stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``.
            Test result: 1 - significant different, 0 - insignificant difference.

        Raises:
            ValueError: If the alternative is not 'two-sided', 'less' or 'greater'.
        """
        self.__check_required_metric_type("test_boot_confint")

        x = self.params.data_params.control
        y = self.params.data_params.treatment
        metric = self.params.hypothesis_params.metric
        alpha = self.params.hypothesis_params.alpha
        alternative = self.params.hypothesis_params.alternative

        metric_diffs = []
        for _ in range(self.params.hypothesis_params.n_boot_samples):
            x_boot = np.random.choice(x, size=x.shape[0], replace=True)
            y_boot = np.random.choice(y, size=y.shape[0], replace=True)
            metric_diffs.append(metric(y_boot) - metric(x_boot))
        diffs = np.asarray(metric_diffs, dtype=float)

        # Estimated probability that the difference is not above zero.
        prob_not_positive = norm.cdf(0, loc=diffs.mean(), scale=diffs.std(ddof=1))

        test_result: int = 0  # 0 - cannot reject H0, 1 - reject H0
        if alternative == "two-sided":
            ci_left, ci_right = np.quantile(diffs, [alpha / 2, 1 - alpha / 2])
            zero_pvalue = 2 * min(prob_not_positive, 1 - prob_not_positive)
            if ci_left > 0 or ci_right < 0:  # zero is outside the interval
                test_result = 1
        elif alternative == "less":
            ci_right = np.quantile(diffs, 1 - alpha)
            zero_pvalue = 1 - prob_not_positive
            if ci_right < 0:  # upper bound is below zero
                test_result = 1
        elif alternative == "greater":
            ci_left = np.quantile(diffs, alpha)
            zero_pvalue = prob_not_positive
            if ci_left > 0:  # lower bound is above zero
                test_result = 1
        else:
            raise ValueError(
                f"Unsupported alternative: '{alternative}'. "
                "Use 'two-sided', 'less' or 'greater'."
            )

        result = {
            "stat": None,
            "p-value": np.round(zero_pvalue, 5),
            "result": test_result,
        }
        return result

    def __ratio_totals_by_id(self, df: DataFrameType) -> Tuple[np.ndarray, np.ndarray]:
        """Sum numerator and denominator per id; one array element per id."""
        num_col = self.params.data_params.numerator
        den_col = self.params.data_params.denominator
        totals = df.groupby(self.params.data_params.id_col)[[num_col, den_col]].sum()
        return (
            totals[num_col].to_numpy(dtype=float),
            totals[den_col].to_numpy(dtype=float),
        )

    def test_boot_ratio(self) -> StatTestResultType:
        """Performs bootstrap for ratio-metric.

        Ids are resampled with replacement inside every group; an id drawn
        several times contributes several times to the resampled totals. The
        bootstrap distribution of differences is re-centred on the observed
        difference and H0 is rejected if zero lies outside its
        [alpha/2, 1 - alpha/2] percentile interval (two-sided only).

        Returns:
            stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``.
            Test result: 1 - significant different, 0 - insignificant difference.
        """
        self.__check_required_metric_type("test_boot_ratio")

        group_col = self.params.data_params.group_col
        x = self.__dataset[
            self.__dataset[group_col] == self.params.data_params.control_name
        ]
        y = self.__dataset[
            self.__dataset[group_col] == self.params.data_params.treatment_name
        ]
        a_num, a_den = self.__ratio_totals_by_id(x)
        b_num, b_den = self.__ratio_totals_by_id(y)
        origin_mean = b_num.sum() / b_den.sum() - a_num.sum() / a_den.sum()

        n_boot = self.params.hypothesis_params.n_boot_samples
        boot_diffs = np.empty(n_boot, dtype=float)
        for i in range(n_boot):
            a_idx = np.random.randint(0, a_num.shape[0], size=a_num.shape[0])
            b_idx = np.random.randint(0, b_num.shape[0], size=b_num.shape[0])
            a_boot_metric = a_num[a_idx].sum() / a_den[a_idx].sum()
            b_boot_metric = b_num[b_idx].sum() / b_den[b_idx].sum()
            boot_diffs[i] = b_boot_metric - a_boot_metric

        # correction: signed shift that centres the bootstrap distribution
        # on the observed difference
        boot_diffs = boot_diffs + (origin_mean - boot_diffs.mean())

        left_quant = self.params.hypothesis_params.alpha / 2
        right_quant = 1 - self.params.hypothesis_params.alpha / 2
        ci_left, ci_right = np.quantile(boot_diffs, [left_quant, right_quant])

        test_result: int = 0  # 0 - cannot reject H0, 1 - reject H0
        if ci_left > 0 or ci_right < 0:
            test_result = 1

        result = {"stat": None, "p-value": None, "result": test_result}
        return result

    def test_boot_welch(self) -> StatTestResultType:
        r"""Performs bootstrap version of Welch's t-test.

        Welch's t-test is used for independent samples with unequal number of
        observations and variance. Statistic of the test:

        .. math::
            t = \frac{\bar{X}_1 - \bar{X}_2}{\sqrt{\frac{s_1^2}{N_1} + \frac{s_2^2}{N_2}}}.

        Both samples are shifted to the common mean (so that H0 holds) and
        resampled with replacement; the p-value is the share of bootstrapped
        statistics at least as extreme as the observed one with respect to the
        alternative.

        Returns:
            stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``.
            Test result: 1 - significant different, 0 - insignificant difference.

        Raises:
            ValueError: If the alternative is not 'two-sided', 'less' or 'greater'.
        """
        self.__check_required_metric_type("test_boot_welch")

        alternative = self.params.hypothesis_params.alternative
        if alternative not in ("two-sided", "less", "greater"):
            raise ValueError(
                f"Unsupported alternative: '{alternative}'. "
                "Use 'two-sided', 'less' or 'greater'."
            )

        x = np.asarray(self.params.data_params.control, dtype=float)
        y = np.asarray(self.params.data_params.treatment, dtype=float)
        n_boot = self.params.hypothesis_params.n_boot_samples

        t_observed = ttest_ind(y, x, equal_var=False)[0]

        # Impose H0: both samples get the mean of the combined sample.
        common_mean = np.mean(np.concatenate([x, y]))
        x_null = x - np.mean(x) + common_mean
        y_null = y - np.mean(y) + common_mean

        t_calc: int = 0
        for _ in range(n_boot):
            x_boot = np.random.choice(x_null, size=x_null.shape[0], replace=True)
            y_boot = np.random.choice(y_null, size=y_null.shape[0], replace=True)
            t_boot = ttest_ind(y_boot, x_boot, equal_var=False)[0]
            if alternative == "two-sided":
                is_extreme = abs(t_boot) >= abs(t_observed)
            elif alternative == "less":
                is_extreme = t_boot <= t_observed
            else:
                is_extreme = t_boot >= t_observed
            if is_extreme:
                t_calc += 1

        pvalue = t_calc / n_boot

        test_result: int = 0  # 0 - cannot reject H0, 1 - reject H0
        if pvalue <= self.params.hypothesis_params.alpha:
            test_result = 1

        result = {
            "stat": np.round(t_observed, 5),
            "p-value": np.round(pvalue, 5),
            "result": test_result,
        }
        return result

    def test_buckets(self) -> StatTestResultType:
        """Performs buckets hypothesis testing.

        Both groups are bucketized. If the bucket metrics of both groups pass
        the Shapiro-Wilk normality check, Welch's t-test is applied to them.
        Otherwise a bootstrap confidence interval test of the mode is run on
        the groups; the configured metric is restored afterwards.

        Returns:
            stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``.
            Test result: 1 - significant different, 0 - insignificant difference.
        """
        self.__check_required_metric_type("test_buckets")

        hypothesis = self.params.hypothesis_params
        x_new = self.__bucketize(self.params.data_params.control)
        y_new = self.__bucketize(self.params.data_params.treatment)

        stat = None
        test_result: int = 0
        if (shapiro(x_new).pvalue >= hypothesis.alpha) and (
            shapiro(y_new).pvalue >= hypothesis.alpha
        ):
            stat, pvalue = ttest_ind(
                y_new,
                x_new,
                equal_var=False,
                alternative=hypothesis.alternative,
            )
            if pvalue <= hypothesis.alpha:
                test_result = 1
        else:

            def metric(arr: np.ndarray) -> float:
                # mode() yields a scalar or an array depending on SciPy version.
                modes = np.atleast_1d(mode(arr).mode)
                return float(np.mean(modes))

            original_metric = hypothesis.metric
            hypothesis.metric = metric
            try:
                boot = self.test_boot_confint()
            finally:
                hypothesis.metric = original_metric
            pvalue, test_result = boot["p-value"], boot["result"]

        result = {
            "stat": None if stat is None else np.round(stat, 5),
            "p-value": np.round(pvalue, 5),
            "result": test_result,
        }
        return result

    def test_chisquare(self) -> StatTestResultType:
        """Performs Chi-Square test.

        Treatment successes/failures are compared with the counts observed in
        the control group, which requires groups of equal size.

        Returns:
            stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``.
            Test result: 1 - significant different, 0 - insignificant difference.

        Raises:
            ValueError: If the groups have different lengths.
        """
        self.__check_required_metric_type("test_chisquare")

        x = self.__get_group(self.params.data_params.control_name, self.dataset)
        y = self.__get_group(self.params.data_params.treatment_name, self.dataset)

        if len(x) != len(y):
            raise ValueError("Both groups have different lengths")

        # NOTE(review): control counts are used as exact expected frequencies
        # (goodness-of-fit); a contingency-table test may be more appropriate - confirm.
        observed = np.array([sum(y), len(y) - sum(y)])
        expected = np.array([sum(x), len(x) - sum(x)])
        stat, pvalue = chisquare(observed, expected)

        test_result: int = 0
        if pvalue <= self.params.hypothesis_params.alpha:
            test_result = 1

        result = {
            "stat": np.round(stat, 5),
            "p-value": np.round(pvalue, 5),
            "result": test_result,
        }
        return result

    def test_delta_ratio(self) -> StatTestResultType:
        """Delta method with bias correction for ratios.

        Source: https://arxiv.org/pdf/1803.06336.pdf.

        Returns:
            stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``.
            Test result: 1 - significant different, 0 - insignificant difference.
        """
        self.__check_required_metric_type("test_delta_ratio")

        group_col = self.params.data_params.group_col
        x = self.__dataset[
            self.__dataset[group_col] == self.params.data_params.control_name
        ]
        y = self.__dataset[
            self.__dataset[group_col] == self.params.data_params.treatment_name
        ]

        ctrl_mean, ctrl_var = self.__delta_params(x)
        treat_mean, treat_var = self.__delta_params(y)

        return self.__manual_ttest(
            ctrl_mean, ctrl_var, x.shape[0], treat_mean, treat_var, y.shape[0]
        )

    def test_mannwhitney(self) -> StatTestResultType:
        r"""Performs Mann-Whitney U test.

        Test works on continues metrics and their ranks.

        Assumptions of Mann-Whitney test:
        1. Independence of observations.
        2. Same shape of metric distributions.

        Statistic of the test:

        .. math::
            U = \sum_{i=1}^{n} \sum_{j=1}^{m} S(X_i, Y_j).

        Returns:
            stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``.
            Test result: 1 - significant different, 0 - insignificant difference.
        """
""" self.__check_required_metric_type("test_mannwhitney") x = self.params.data_params.control y = self.params.data_params.treatment test_result: int = 0 stat, pvalue = mannwhitneyu( x, y, alternative=self.params.hypothesis_params.alternative ) if pvalue <= self.params.hypothesis_params.alpha: test_result = 1 result = { "stat": np.round(stat, 5), "p-value": np.round(pvalue, 5), "result": test_result, } return result def test_taylor_ratio(self) -> StatTestResultType: """Calculate expectation and variance of ratio for each group and then use t-test for hypothesis testing. Source: http://www.stat.cmu.edu/~hseltman/files/ratio.pdf. Returns: stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``. Test result: 1 - significant different, 0 - insignificant difference. """ self.__check_required_metric_type("test_taylor_ratio") x = self.__dataset[ self.__dataset[self.params.data_params.group_col] == self.params.data_params.control_name ] y = self.__dataset[ self.__dataset[self.params.data_params.group_col] == self.params.data_params.treatment_name ] ctrl_mean, ctrl_var = self.__taylor_params(x) treat_mean, treat_var = self.__taylor_params(y) return self.__manual_ttest( ctrl_mean, ctrl_var, x.shape[0], treat_mean, treat_var, y.shape[0] ) def test_welch(self) -> StatTestResultType: """Performs Welch's t-test. Returns: stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``. Test result: 1 - significant different, 0 - insignificant difference. 
""" self.__check_required_metric_type("test_welch") x = self.params.data_params.control y = self.params.data_params.treatment test_result: int = 0 stat, pvalue = ttest_ind( y, x, equal_var=False, alternative=self.params.hypothesis_params.alternative ) if pvalue <= self.params.hypothesis_params.alpha: test_result = 1 result = { "stat": np.round(stat, 5), "p-value": np.round(pvalue, 5), "result": test_result, } return result def test_z_proportions(self) -> StatTestResultType: r"""Performs z-test for proportions. The two-proportions z-test is used to compare two observed proportions. Statistic of the test: .. math:: Z = \frac{\hat{p}_1 - \hat{p}_2}{\sqrt{\hat{p}(1-\hat{p})(\frac{1}{n_1} + \frac{1}{n_2})}}. Returns: stat_test_typing: Dictionary with following properties: ``test statistic``, ``p-value``, ``test result``. Test result: 1 - significant different, 0 - insignificant difference. """ self.__check_required_metric_type("test_z_proportions") x = self.__get_group(self.params.data_params.control_name, self.dataset) y = self.__get_group(self.params.data_params.treatment_name, self.dataset) count = np.array([sum(y), sum(x)]) nobs = np.array([len(y), len(x)]) alternative = self.params.hypothesis_params.alternative if alternative != "two-sided": alternative = "smaller" if alternative == "less" else "larger" stat, pvalue = proportions_ztest(count, nobs, alternative=alternative) test_result: int = 0 if pvalue <= self.params.hypothesis_params.alpha: test_result = 1 result = { "stat": np.round(stat, 5), "p-value": np.round(pvalue, 5), "result": test_result, } return result