Uplift modelling¶
An complex example of how heterogeneous treatment effects are used in ecommerce platforms. 🚀🚀🚀
The business problem:
We are concerted with giving or not giving users a 5% discount voucher.
If we give it to everyone, we'll likely get more bookings, but at the same time we hurt our margins. So the best way to proceed is to give the coupons only to those customers that were unsure about the decision and that will actually convert after receiving it. Also, not giving the coupons to customers that were already going to book something no matter what!
The causal problem:
Our treatment $T$ is offering customers this coupon, a binary variable.
Our outcome $Y$ is the customer finalizing their booking or not, also a binary variable.
Additionally, we have access to a number of other variables $X$ characterizing the user, the channel that brough them to the platform, the specific ongoing session, and so on... We believe some of them could be valuable in deciding the effectiveness of the intervention. In other words, the effect isn't uniform but varies depending on some of these factors.
import warnings
from typing import Dict, Tuple
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from uplift_modelling_utils import data_generating_process
np.random.seed(111)
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
sns.plotting_context("notebook")
Collect data and check basic properties¶
The data_generating_process
function returns the data
itself, which can be used to train models. It also returns metadata
, which is NOT available in practice, but which we can explore here to assess model performance exactly.
We draw 60000 samples and see that there seems to be a difference in the booking probability when offering customers a coupon.
# Observational (targeted / unbalanced)
data, metadata = data_generating_process(60000, setting="observational")
data.head()
client_id | Y | T | channel | device | lead_time | funnel_depth | price_sort_used | past_coupon_user | tenure_days | dest_tier | hour_local | recent_ad_exposure | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | desktop | 14 | 0 | 0 | 1 | 231 | tier2 | 6 | 0 | |
1 | 1 | 0 | 1 | paid_search | mobile_web | 5 | 1 | 0 | 1 | 1238 | tier2 | 15 | 0 |
2 | 2 | 0 | 1 | paid_search | desktop | 90 | 1 | 1 | 0 | 1498 | tier1 | 11 | 0 |
3 | 3 | 0 | 1 | paid_search | mobile_web | 30 | 2 | 1 | 1 | 138 | tier1 | 9 | 0 |
4 | 4 | 0 | 1 | paid_search | desktop | 3 | 2 | 1 | 0 | 1672 | tier1 | 3 | 0 |
# Quick sanity checks
print("Dataset:")
print(" n =", len(data))
print(" treat_rate =", data["T"].mean().round(4))
print(" conv_rate_T1 =", data.loc[data["T"] == 1, "Y"].mean().round(4))
print(" conv_rate_T0 =", data.loc[data["T"] == 0, "Y"].mean().round(4))
print(" ATE (naive) =", (data.loc[data["T"] == 1, "Y"].mean() - data.loc[data["T"] == 0, "Y"].mean()).round(4))
Dataset: n = 60000 treat_rate = 0.5733 conv_rate_T1 = 0.178 conv_rate_T0 = 0.1132 ATE (naive) = 0.0648
# One hot encode "channel" and "device"
data = pd.get_dummies(data, columns=["channel", "device"], drop_first=True, dtype=int)
# Ordinal encode "dest_tier"
data.loc[:, "dest_tier"] = data["dest_tier"].map({"tier1": 1, "tier2": 2, "tier3": 3})
data
client_id | Y | T | lead_time | funnel_depth | price_sort_used | past_coupon_user | tenure_days | dest_tier | hour_local | recent_ad_exposure | channel_direct | channel_email | channel_organic | channel_paid_search | device_desktop | device_mobile_web | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 14 | 0 | 0 | 1 | 231 | 2 | 6 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 1 | 5 | 1 | 0 | 1 | 1238 | 2 | 15 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
2 | 2 | 0 | 1 | 90 | 1 | 1 | 0 | 1498 | 1 | 11 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
3 | 3 | 0 | 1 | 30 | 2 | 1 | 1 | 138 | 1 | 9 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
4 | 4 | 0 | 1 | 3 | 2 | 1 | 0 | 1672 | 1 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
59995 | 59995 | 0 | 1 | 21 | 0 | 0 | 1 | 1578 | 3 | 17 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
59996 | 59996 | 0 | 1 | 60 | 1 | 1 | 0 | 577 | 1 | 20 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
59997 | 59997 | 0 | 0 | 5 | 1 | 1 | 0 | 1076 | 1 | 13 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
59998 | 59998 | 0 | 1 | 7 | 0 | 1 | 0 | 489 | 1 | 11 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
59999 | 59999 | 0 | 0 | 10 | 0 | 0 | 1 | 1032 | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
60000 rows × 17 columns
def train_test_split_df(df: pd.DataFrame, features: list, test_size=0.25, seed=123):
x = df[features].values
y = df["Y"].astype(float).values
t = df["T"].astype(float).values
return train_test_split(x, y, t, test_size=test_size, random_state=seed, stratify=t)
X_train, X_test, y_train, y_test, t_train, t_test = train_test_split_df(
data,
features=[col for col in data.columns if col not in ["client_id", "Y", "T"]],
test_size=0.25,
)
class TLearner:
"""A simple T-Learner implementation."""
def __init__(self, model_factory=lambda: GradientBoostingClassifier(random_state=7)) -> None:
"""Specify the model class (factory) to be used for the two models."""
self.m1 = model_factory()
self.m0 = model_factory()
def fit(self, x: np.ndarray, t: np.ndarray, y: np.ndarray) -> None:
"""Fit the control and treatment models."""
self.m1.fit(x[t == 1], y[t == 1])
self.m0.fit(x[t == 0], y[t == 0])
def predict_uplift(self, x: np.ndarray) -> np.ndarray:
"""Predict the uplift (treatment effect) for each sample in x."""
p1 = self.m1.predict_proba(x)[:, 1]
p0 = self.m0.predict_proba(x)[:, 1]
return p1 - p0
# T-learner
t_learner = TLearner()
t_learner.fit(X_train, t_train, y_train)
class XLearner:
"""A simple X-Learner implementation."""
def __init__(
self,
clf_factory=lambda: GradientBoostingClassifier(random_state=7),
reg_factory=lambda: GradientBoostingRegressor(random_state=7),
prop_factory=lambda: LogisticRegression(max_iter=300),
) -> None:
"""Specify the model classes (factories) to be used."""
self.m1 = clf_factory()
self.m0 = clf_factory()
self.g1 = reg_factory()
self.g0 = reg_factory()
self.prop = prop_factory()
def fit(self, x: np.ndarray, y: np.ndarray, t: np.ndarray) -> None:
"""Fit the two outcome models, the two treatment effect models, and the propensity model."""
self.m1.fit(x[t == 1], y[t == 1])
self.m0.fit(x[t == 0], y[t == 0])
p0_cf = self.m0.predict_proba(x)[:, 1]
p1_cf = self.m1.predict_proba(x)[:, 1]
d1 = y - p0_cf # valid where t==1
d0 = p1_cf - y # valid where t==0
self.g1.fit(x[t == 1], d1[t == 1])
self.g0.fit(x[t == 0], d0[t == 0])
self.prop.fit(x, t)
def predict_uplift(self, x: np.ndarray) -> np.ndarray:
"""Predict the uplift for a given set of features."""
e = np.clip(self.prop.predict_proba(x)[:, 1], 1e-3, 1 - 1e-3)
tau1 = self.g1.predict(x)
tau0 = self.g0.predict(x)
return e * tau0 + (1.0 - e) * tau1
# Example with RCT data (balanced groups)
# from your generator:
# rct_df = generate_lodging_uplift_data(60_000, "rct", seed=12)
# obs_df = generate_lodging_uplift_data(60_000, "observational", seed=11)
# Replace with one of your dataframes:
df = rct_df # or obs_df
X_tr, X_te, y_tr, y_te, t_tr, t_te = train_test_split_df(df)
# T-learner
m1, m0 = fit_t_learner(X_tr, y_tr, t_tr)
tau_hat_T = predict_uplift_t_learner(m1, m0, X_te)
# X-learner
prop_tr = fit_propensity(X_tr, t_tr) if "p_treat" not in df.columns else None
x_models = fit_x_learner(X_tr, y_tr, t_tr, prop_model=prop_tr)
tau_hat_X = predict_uplift_x_learner(x_models, X_te)
# DR-learner
dr_models = fit_dr_learner(X_tr, y_tr, t_tr, prop_model=prop_tr)
tau_hat_DR = predict_uplift_dr_learner(dr_models, X_te)
# Evaluate with policy curves
test_df = pd.DataFrame({
"Y": y_te, "T": t_te,
"p_treat": df.loc[X_te.index, "p_treat"].values if "p_treat" in df.columns else 0.5
})
use_ipw = not np.allclose(test_df["p_treat"].values, 0.5) # Observational -> IPW; RCT -> diff-in-means
curve_T = policy_value_curve(test_df, tau_hat_T, use_ipw=use_ipw)
curve_X = policy_value_curve(test_df, tau_hat_X, use_ipw=use_ipw)
curve_DR = policy_value_curve(test_df, tau_hat_DR, use_ipw=use_ipw)
print("Top-k AUUC (T, X, DR):",
curve_T["AUUC"].iloc[-1].round(4),
curve_X["AUUC"].iloc[-1].round(4),
curve_DR["AUUC"].iloc[-1].round(4))
# Treating the top 30% example (simulate policy value)
top_k = 0.30
k_idx = int(np.floor(top_k * len(test_df)))
order = np.argsort(-tau_hat_DR) # by DR uplift
seg = test_df.iloc[order[:k_idx]]
if use_ipw:
e = np.clip(seg["p_treat"].values, 1e-3, 1-1e-3)
y = seg["Y"].values; t = seg["T"].values
mu_t = np.sum((t/e)*y)/np.sum(t/e)
mu_c = np.sum(((1-t)/(1-e))*y)/np.sum((1-t)/(1-e))
print("Top-30% uplift per user (IPW):", (mu_t - mu_c).round(4))
else:
print("Top-30% uplift per user (RCT diff-in-means):",
(seg.loc[seg["T"]==1,"Y"].mean() - seg.loc[seg["T"]==0,"Y"].mean()).round(4))
Conclusions¶
TBW