import numpy as np
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.tree
import sklearn.svm
import matplotlib.pyplot as plt
from scipy.stats import norm
# Experiment parameters for the PU-learning (positive/unlabeled) simulation.
sample_size = 10000  # total number of training instances
neg_mu = -1          # mean of the negative-class Gaussian
pos_mu = 1           # mean of the positive-class Gaussian
c = 0.3              # label frequency: Pr(s=1 | y=1) under the SCAR assumption
alpha = 0.5          # class prior: Pr(y=1)
# Factories so every experiment fits a freshly initialised model.
# penalty=None is the supported spelling since sklearn 1.2; the old
# penalty="none" string was removed in sklearn 1.4.
model_types = [
    lambda: sklearn.linear_model.LogisticRegression(penalty=None),
    sklearn.naive_bayes.GaussianNB,
    lambda: sklearn.tree.DecisionTreeClassifier(min_samples_leaf=int(sample_size / 10)),
]  # , lambda: sklearn.svm.SVC(kernel="linear", probability=True)]
tau = 0.5  # decision threshold on Pr(y=1|x)
xmin, xmax = -5, 5
x = np.linspace(xmin, xmax, 100000)  # evaluation grid for the analytic curves
# Analytic mixture components: p(x|y=1)*Pr(y=1) and p(x|y=0)*Pr(y=0).
distr_x_if_pos = alpha * norm.pdf(x, pos_mu)
distr_x_if_neg = (1 - alpha) * norm.pdf(x, neg_mu)
distr_x = distr_x_if_pos + distr_x_if_neg
# SCAR: the labeled density is a uniform fraction c of the positive density.
distr_x_if_lab = distr_x_if_pos * c
distr_x_if_unl = distr_x - distr_x_if_lab
pr_pos_if_x = distr_x_if_pos / distr_x  # true Pr(y=1|x)
pr_lab_if_x = distr_x_if_lab / distr_x  # true Pr(s=1|x) = c * Pr(y=1|x)
# Figure: class-conditional densities (top) and the true posterior (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(15, 5),
                               gridspec_kw={'height_ratios': [3, 1]})
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution", color='tab:blue')
ax1.plot(x, distr_x_if_neg, label="negative distribution", color='tab:red')
# ax1.plot(x, distr_x, label="total distribution")
ax1.set_yticks([])
ax1.set_xticks([])
ax1.set_ylim([0, 0.3])
# Raw strings: "\P" is an invalid escape sequence in a normal string literal.
ax2.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$", color='tab:blue')
ax2.set_ylabel(r"$\Pr$")
ax2.set_ylim(0, 1)
# fig.legend()
fig.tight_layout()
# Figure: same view, now with the labeled/unlabeled split of the data.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(15, 5),
                               gridspec_kw={'height_ratios': [3, 1]})
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution", color='tab:blue')
ax1.plot(x, distr_x_if_neg, label="negative distribution", color='tab:red')
ax1.plot(x, distr_x_if_lab, label="labeled distribution", color='tab:blue', dashes=(5, 5))
ax1.plot(x, distr_x_if_unl, label="unlabeled distribution", color='tab:grey', dashes=(5, 5))
ax1.set_yticks([])
ax1.set_xticks([])
ax1.set_ylim([0, 0.3])
# Raw strings: "\P" is an invalid escape sequence in a normal string literal.
ax2.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$", color='tab:blue')
ax2.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$", color='tab:blue', dashes=(5, 5))
ax2.set_ylabel(r"$\Pr$")
ax2.set_ylim(0, 1)
# fig.legend()
fig.tight_layout()
# Sample training data from the two Gaussians (balanced classes, alpha = 0.5).
neg_train = np.random.normal(neg_mu, size=sample_size // 2)
pos_train = np.random.normal(pos_mu, size=sample_size // 2)
n_lab = int(c * sample_size // 2)  # number of labeled positives
n_unl = sample_size - n_lab        # everything else is unlabeled
lab_pos_train = pos_train[:n_lab]
# Off-by-one fix: slice from n_lab (not n_lab+1) so no positive is dropped.
unl_pos_train = pos_train[n_lab:]
x_train = np.concatenate((neg_train, pos_train)).reshape((-1, 1))
y_train = np.array([0] * (sample_size // 2) + [1] * (sample_size // 2))
# s marks the *last* n_lab rows of x_train (a subset of the positives) as
# labeled; positives are exchangeable, so which n_lab carry s=1 is irrelevant.
s_train = np.array([0] * n_unl + [1] * n_lab)
# Fully supervised ("golden") baseline: fit on the true labels y.
golden_probs_models = []
for model_type in model_types:
    golden_probs_models += [model_type().fit(x_train, y_train).predict_proba(x.reshape((-1, 1)))[:, 1]]
# Figure: the golden models recover the true posterior reasonably well.
fig, ax = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw string: "\P" is an invalid escape otherwise
# plt.scatter(x_train, y_train, label="training data (fully labeled)")
plt.plot(x, pr_pos_if_x, label="True probabilities", linewidth=5, color="tab:orange")
for i, golden_probs_model in enumerate(golden_probs_models):
    plt.plot(x, golden_probs_model, label=f"golden model {i}", color='tab:blue')
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.xlim(xmin, xmax)
plt.ylim(0, 1)
ax.set_xticks([])
# plt.legend()
[]
Yes!
# Figure: rescaling Pr(s=1|x) by 1/c recovers the true Pr(y=1|x).
fig, ax1 = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr$")  # raw strings: "\P" is an invalid escape otherwise
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$", color='tab:blue')
plt.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$", color='tab:blue', dashes=(20, 20))
plt.plot(x, pr_lab_if_x / c, '--', label=r"True $\frac{1}{c}\Pr(s=1|x)$",
         dashes=(20, 20), color='tab:red')
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
ax1.set_xticks([])
ax1.set_ylim(0, 1)
plt.xlim([-5, 5])
# plt.legend()
(-5.0, 5.0)
# Figure: thresholding Pr(s=1|x) at c*tau makes the same decisions as
# thresholding Pr(y=1|x) at tau.
fig, ax1 = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr$")  # raw strings: "\P" is an invalid escape otherwise
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$", color='tab:blue')
plt.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$", color='tab:blue', dashes=(20, 20))
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.hlines(c * tau, xmin, xmax, 'r', label="PU decision threshold")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
ax1.set_xticks([])
ax1.set_ylim([0, 1])
plt.xlim([-5, 5])
# plt.legend()
(-5.0, 5.0)
Not necessarily
# Non-traditional classifiers: fit directly on the observed labels s.
non_traditional_classifier_probs_models = []
for model_type in model_types:
    non_traditional_classifier_probs_models += [
        model_type().fit(x_train, s_train).predict_proba(x.reshape((-1, 1)))[:, 1]]
# Estimate Pr(y=1|x) by rescaling the first model's Pr(s=1|x) with 1/c.
scaled_non_traditional_classifier_probs = 1 / c * non_traditional_classifier_probs_models[0]
fig, ax = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr(s=1|x)$")  # raw string: "\P" is an invalid escape otherwise
# plt.scatter(x_train, y_train, label="training data (fully labeled)")
plt.plot(x, pr_lab_if_x, label="True probabilities", linewidth=3, color="tab:orange")
for i, non_traditional_classifier_probs_model in enumerate(non_traditional_classifier_probs_models):
    plt.plot(x, non_traditional_classifier_probs_model,
             label=f"non-traditional model {i}", color='tab:blue')
# plt.plot(x, non_traditional_classifier_probs_models[-1], label=f"non-traditional model {i}", color='tab:green', linewidth=5)
plt.hlines(tau * c, xmin, xmax, 'k', label="decision threshold")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.xlim(xmin, xmax)
plt.ylim(0, .4)
ax.set_xticks([])
# plt.ylim(0, 1)
plt.hlines(c, xmin, xmax, 'tab:grey')  # reference line at Pr(s=1|x) = c
# plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
# plt.legend()
# Figure: scaled non-traditional classifier vs. the true curves.
fig, ax = plt.subplots(figsize=(15, 10.5))
# plt.scatter(x_train, s_train)
# Label fix: the mathtext was left unterminated (missing closing "$").
plt.plot(x, non_traditional_classifier_probs_models[0],
         label="Non traditional classifier $Pr(s=1|x)$", color='tab:orange')
plt.plot(x, scaled_non_traditional_classifier_probs,
         label=r"Scaled non traditional classifier $\Pr(y=1|x)$",
         color='tab:orange', dashes=(20, 20))
plt.plot(x, pr_pos_if_x, label="True $Pr(y=1|x)$", color='tab:blue')
plt.plot(x, pr_lab_if_x, label="True $Pr(s=1|x)$", color='tab:blue', dashes=(20, 20))
# plt.hlines(0.5, xmin, xmax, 'k', label="decision threshold")
plt.hlines(1, -5, 5, 'k', linewidth=1)  # the scaled estimate can exceed 1
# plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.ylim([0, 1.5])
plt.xlim([-5, 5])
ax.set_xticks([])
ax.set_ylabel(r"$\Pr$")
Text(0, 0.5, '$\\Pr$')
# PU threshold: since Pr(s=1|x) = c * Pr(y=1|x), comparing Pr(y=1|x) >= tau
# is equivalent to comparing Pr(s=1|x) >= c * tau.
tau_pu = c * tau
It works in theory (with correct $\Pr(s=1|x)$)
# Figure: with the *true* Pr(s=1|x) the PU threshold tau_pu works in theory.
fig, ax1 = plt.subplots(figsize=(15, 7))
# plt.scatter(x_train, s_train)
# Label fix: closing "$" added (the mathtext was left unterminated).
plt.plot(x, non_traditional_classifier_probs_models[0],
         label="Non traditional classifier $Pr(s=1|x)$", color='tab:orange')
plt.plot(x, pr_pos_if_x, label="True $Pr(y=1|x)$", color='tab:blue')
plt.plot(x, pr_lab_if_x, label="True $Pr(s=1|x)$", color='tab:blue', dashes=(20, 20))
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.hlines(tau_pu, xmin, xmax, 'tab:red', label="PU decision threshold")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
ax1.set_xticks([])
plt.ylim([0, 1])
plt.xlim([-5, 5])
# plt.legend()
(-5.0, 5.0)
But what if the non-traditional classifier does not predict correct probabilities? Then this also does not help. The classifier is biased towards the negative class because there were more negative examples during training.
# Class weights that rebalance the two s-classes: with n_lab = c*sample_size/2
# labeled and n_unl = sample_size - n_lab unlabeled examples, these satisfy
# n_lab * w_pos = n_unl * w_neg = sample_size / 2.
w_pos = 1 / c
w_neg = 1 / (2 - c)
The probabilities are not correct, but the decision threshold is expected to be correct
# Analytic view of class weighting: scale the labeled ("positive") and
# unlabeled ("negative") densities by their class weights, then renormalize
# to obtain the posterior a class-weighted learner would target.
distr_x_if_lab_weighted = w_pos * distr_x_if_lab
distr_x_if_unl_weighted = w_neg * distr_x_if_unl
distr_x_weighted = distr_x_if_lab_weighted + distr_x_if_unl_weighted
pr_lab_if_x_weighted = distr_x_if_lab_weighted / distr_x_weighted
# Figure: class-weighted densities (top) and the resulting posterior (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(15, 8),
                               gridspec_kw={'height_ratios': [5, 2]})
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution", color='tab:blue')
ax1.plot(x, distr_x_if_neg, label="negative distribution", color='tab:red')
ax1.plot(x, distr_x_if_lab, label="labeled distribution", color='tab:blue', dashes=(5, 5))
ax1.plot(x, distr_x_if_unl, label="unlabeled distribution", color='tab:grey', dashes=(5, 5))
# Scale the weighted distributions by 0.99 so overlapping curves stay visible.
ax1.plot(x, distr_x_if_lab_weighted * 0.99, label="weighted labeled distribution",
         color='tab:blue', dashes=(4, 10))
ax1.plot(x, distr_x_if_unl_weighted * 0.99, label="weighted unlabeled distribution",
         color='tab:grey', dashes=(4, 10))
ax1.set_yticks([])
ax1.set_xticks([])
ax1.set_ylim([0, 0.3])
ax1.set_xlim([xmin, xmax])
# Raw strings: "\P" is an invalid escape sequence in a normal string literal.
ax2.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$", color='tab:blue')
ax2.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$", color='tab:blue', dashes=(5, 5))
ax2.plot(x, pr_lab_if_x_weighted, label=r"Per class-weighted $\Pr(s=1|x)$",
         color='tab:blue', dashes=(4, 10))
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'tab:grey', label="decision threshold")
ax2.set_ylabel(r"$\Pr$")
ax2.set_ylim(0, 1)
# fig.legend()
fig.tight_layout()
Again, our model is not able to predict the correct probabilities and is biased towards the negative examples, because that space was more clearly negative. So now, not only the probabilities are incorrect, but also the decision threshold.
# Per-instance weights implementing the class weighting on the s-labels.
weights = (s_train == 0) * w_neg + (s_train == 1) * w_pos
weighted_probs_models = []
for model_type in model_types:
    weighted_probs_models += [
        model_type().fit(x_train, s_train, sample_weight=weights)
        .predict_proba(x.reshape((-1, 1)))[:, 1]]
# (Removed a redundant recomputation of scaled_non_traditional_classifier_probs;
# it was already assigned the identical value earlier.)
fig, ax = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw strings: "\P" is an invalid escape otherwise
plt.plot(x, pr_pos_if_x, label="True probabilities", linewidth=3, color='orange')
plt.plot(x, pr_lab_if_x_weighted, label=r"Per class-weighted $\Pr(s=1|x)$", color='tab:orange')
for i, prs in enumerate(weighted_probs_models):
    plt.plot(x, prs, label=f"model from class-weighted instances {i}", color='tab:blue')
# plt.plot(x, weighted_probs_models[-1], label=f"model from class-weighted instances {i}", color='tab:green', linewidth=5)
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.ylim(0, 1)
plt.xlim(xmin, xmax)
ax.set_xticks([])
[]
Yes!
# Elkan & Noto (2008) instance weighting:
#   Pr(y=1 | s=0, x) = ((1-c)/c) * g(x) / (1 - g(x)),  with g(x) = Pr(s=1|x).
# Split the unlabeled density into its positive and negative parts accordingly.
_lab_odds = pr_lab_if_x / (1 - pr_lab_if_x)
pr_pos_if_unl = (1 - c) / c * _lab_odds
distr_x_if_unl_pos = distr_x_if_unl * pr_pos_if_unl
distr_x_if_unl_neg = distr_x_if_unl * (1 - pr_pos_if_unl)
# Figure: EN instance weighting recovers the true positive/negative densities
# when the *true* Pr(s=1|x) is used for the weights.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(15, 8),
                               gridspec_kw={'height_ratios': [5, 2]})
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution", color='tab:blue', linewidth=3)
ax1.plot(x, distr_x_if_neg, label="negative distribution", color='tab:red', linewidth=3)
ax1.plot(x, distr_x_if_lab, label="labeled distribution", color='tab:blue', dashes=(5, 5))
ax1.plot(x, distr_x_if_unl, label="unlabeled distribution", color='tab:grey', dashes=(5, 5))
ax1.plot(x, distr_x_if_lab + distr_x_if_unl_pos,
         label="EN instance weighted positive distribution", color='y', dashes=(20, 20))
# Label fix: this curve is the weighted NEGATIVE distribution.
ax1.plot(x, distr_x_if_unl_neg,
         label="EN instance weighted negative distribution", color='y', dashes=(20, 20))
ax1.set_yticks([])
ax1.set_xticks([])
ax1.set_ylim([0, 0.3])
ax1.set_xlim([xmin, xmax])
ax2.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$", color='tab:blue')
ax2.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$", color='tab:blue', dashes=(5, 5))
# With the true weights the EN-weighted posterior coincides with pr_pos_if_x,
# so the same array is deliberately re-plotted dashed on top of the solid curve.
ax2.plot(x, pr_pos_if_x, label=r"EN instance weighted $\Pr(y=1|x)$", color='y', dashes=(20, 20))
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'tab:grey', label="decision threshold")
ax2.set_ylabel(r"$\Pr$")
ax2.set_ylim(0, 1)
# fig.legend()
fig.tight_layout()
Again we have the problem of the bias in learning the non-traditional classifier, which prevents us from correctly weighting the instances. Additionally, our non-traditional classifier learned probabilities $\Pr(s=1|x)>c$, which results in probabilities $\Pr(y=1|s=0,x)>1$, which is impossible, so we need to cut them off at 1.
Together, this makes it not work as well in practice as in theory. But still, the resulting model is better than the one trained with class-weighted instances, which we also expected from the theory.
x_train_lab = x_train[s_train == 1]
x_train_unl = x_train[s_train == 0]
# EN instance weighting: each unlabeled point appears once as a positive with
# weight Pr(y=1|s=0,x) and once as a negative with the complementary weight.
# The duplicated design matrix and targets are loop-invariant, so build them once.
x_train_EN = np.concatenate((x_train_lab, x_train_unl, x_train_unl))  # duplicate unlabeled examples
y_train_EN = np.concatenate((np.ones(n_lab + n_unl), np.zeros(n_unl)))
EN_weighted_probs_models = []
for model_type in model_types:
    # Each model estimates its own Pr(s=1|x) on the unlabeled points.
    train_s_probs_unl = model_type().fit(x_train, s_train).predict_proba(x_train[s_train == 0])[:, 1]
    pr_pos_if_unl_train = (1 - c) / c * train_s_probs_unl / (1 - train_s_probs_unl)
    # Estimated Pr(s=1|x) can exceed c, yielding impossible weights > 1: cap them.
    pr_pos_if_unl_train = np.minimum(pr_pos_if_unl_train, 1)
    weights_EN = np.concatenate((np.ones(n_lab), pr_pos_if_unl_train, 1 - pr_pos_if_unl_train))
    EN_weighted_probs_models += [
        model_type().fit(x_train_EN, y_train_EN, sample_weight=weights_EN)
        .predict_proba(x.reshape((-1, 1)))[:, 1]]
# Same EN weighting, but the weights for *all* models come from one shared
# non-traditional classifier: the decision tree (model_types[2]).
train_s_probs_unl = model_types[2]().fit(x_train, s_train).predict_proba(x_train[s_train == 0])[:, 1]
pr_pos_if_unl_train = (1 - c) / c * train_s_probs_unl / (1 - train_s_probs_unl)
pr_pos_if_unl_train = np.minimum(pr_pos_if_unl_train, 1)  # cap impossible probabilities > 1
x_train_EN = np.concatenate((x_train_lab, x_train_unl, x_train_unl))  # duplicate unlabeled examples
y_train_EN = np.concatenate((np.ones(n_lab + n_unl), np.zeros(n_unl)))
weights_EN = np.concatenate((np.ones(n_lab), pr_pos_if_unl_train, 1 - pr_pos_if_unl_train))
EN_DT_weighted_probs_models = []
for model_type in model_types:
    EN_DT_weighted_probs_models += [
        model_type().fit(x_train_EN, y_train_EN, sample_weight=weights_EN)
        .predict_proba(x.reshape((-1, 1)))[:, 1]]
# Figure: EN-weighted models (per-model weights vs. shared DT weights).
fig, ax = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw string: "\P" is an invalid escape otherwise
plt.plot(x, pr_pos_if_x, label="True probabilities", linewidth=4, color='orange')
for i, prs in enumerate(EN_weighted_probs_models):
    plt.plot(x, prs, label=f"EN instance weighted {i}", color='tab:blue', linewidth=2)
# Label fix: these are the DT-weighted variants, not a duplicate of the above.
for i, prs in enumerate(EN_DT_weighted_probs_models):
    plt.plot(x, prs, label=f"EN (DT) instance weighted {i}", color='tab:purple', linewidth=1)
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.ylim(0, 1)
plt.xlim(xmin, xmax)
ax.set_xticks([])
[]
Yes!
# ERM/cost-based weighting: rescaling the labeled density by 1/c recovers the
# positive density exactly (distr_x_if_lab == c * distr_x_if_pos under SCAR);
# subtracting it from the total leaves the negative density.
inv_c = 1 / c
distr_x_pos_rescaling_lab = inv_c * distr_x_if_lab
distr_x_neg_total_minus_pos = distr_x - distr_x_pos_rescaling_lab
# Figure: ERM instance weighting — the rescaled labeled density equals the true
# positive density, so the weighted curves overlap the true ones exactly.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(15, 8),
                               gridspec_kw={'height_ratios': [5, 2]})
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution", color='tab:blue', linewidth=3)
ax1.plot(x, distr_x_if_neg, label="negative distribution", color='tab:red', linewidth=3)
ax1.plot(x, distr_x_if_lab, label="labeled distribution", color='tab:blue', dashes=(5, 5))
ax1.plot(x, distr_x_if_unl, label="unlabeled distribution", color='tab:grey', dashes=(5, 5))
ax1.plot(x, distr_x_pos_rescaling_lab,
         label="ERM instance weighted positive distribution", color='y', dashes=(20, 20))
# Label fix: this curve is the weighted NEGATIVE distribution.
ax1.plot(x, distr_x_neg_total_minus_pos,
         label="ERM instance weighted negative distribution", color='y', dashes=(20, 20))
ax1.set_yticks([])
ax1.set_xticks([])
ax1.set_ylim([0, 0.3])
ax1.set_xlim([xmin, xmax])
ax2.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$", color='tab:blue')
ax2.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$", color='tab:blue', dashes=(5, 5))
# Label fix: this is the ERM (not EN) weighted posterior; with the true
# densities it coincides with pr_pos_if_x, hence the same array is re-plotted.
ax2.plot(x, pr_pos_if_x, label=r"ERM instance weighted $\Pr(y=1|x)$", color='y', dashes=(20, 20))
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'tab:grey', label="decision threshold")
ax2.set_ylabel(r"$\Pr$")
ax2.set_ylim(0, 1)
# fig.legend()
fig.tight_layout()
Yes! But your model needs to be able to handle negative weights, and this is not always the case. ==> Method modification possible to handle edge cases
# ERM weighting: every labeled example appears once as a positive with weight
# 1/c and once as a negative with weight (1 - 1/c) — NEGATIVE since c < 1, so
# the learner must accept negative sample weights. Unlabeled examples are
# plain negatives with weight 1.
x_train_lab = x_train[s_train == 1]
x_train_unl = x_train[s_train == 0]
x_train_ERM = np.concatenate((x_train_lab, x_train_lab, x_train_unl))  # labeled examples appear twice
y_train_ERM = np.concatenate((np.ones(n_lab), np.zeros(n_lab + n_unl)))
weights_ERM = np.concatenate((np.full(n_lab, 1 / c), np.full(n_lab, 1 - 1 / c), np.ones(n_unl)))
ERM_weighted_probs_models = []
for model_type in model_types:
    ERM_weighted_probs_models += [
        model_type().fit(x_train_ERM, y_train_ERM, sample_weight=weights_ERM)
        .predict_proba(x.reshape((-1, 1)))[:, 1]]
# Figure: ERM instance-weighted models vs. the true posterior.
fig, ax = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw string: "\P" is an invalid escape otherwise
plt.plot(x, pr_pos_if_x, label="True probabilities", linewidth=5, color='orange')
for i, prs in enumerate(ERM_weighted_probs_models):
    plt.plot(x, prs, label=f"ERM instance weighted {i}", color='tab:blue', linewidth=2)
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.ylim(0, 1)
plt.xlim(xmin, xmax)
ax.set_xticks([])
# Compare all PU approaches for model 0 (logistic regression).
i = 0
fig, ax = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw string: "\P" is an invalid escape otherwise
plt.plot(x, pr_pos_if_x, label="True probabilities", linewidth=5, color='orange')
plt.plot(x, ERM_weighted_probs_models[i], label=f"ERM instance weighted {i}", color='tab:blue')
plt.plot(x, EN_weighted_probs_models[i], label=f"EN instance weighted {i}", color='tab:purple')
plt.plot(x, weighted_probs_models[i], label=f"class-weighted {i}", color='tab:cyan')
# Label fix: this curve is the rescaled non-traditional classifier,
# not a duplicate of the class-weighted one.
plt.plot(x, non_traditional_classifier_probs_models[i] / c, '--',
         label=f"scaled non-traditional {i}", color='tab:green')
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.ylim(0, 1)
plt.xlim(xmin, xmax)
ax.set_xticks([])
# Compare all PU approaches for model 1 (Gaussian naive Bayes).
i = 1
fig, ax = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw string: "\P" is an invalid escape otherwise
plt.plot(x, pr_pos_if_x, label="True probabilities", linewidth=5, color='orange')
plt.plot(x, ERM_weighted_probs_models[i], label=f"ERM instance weighted {i}", color='tab:blue')
plt.plot(x, EN_weighted_probs_models[i], label=f"EN instance weighted {i}", color='tab:purple')
plt.plot(x, weighted_probs_models[i], label=f"class-weighted {i}", color='tab:cyan')
# Label fix: this curve is the rescaled non-traditional classifier,
# not a duplicate of the class-weighted one.
plt.plot(x, non_traditional_classifier_probs_models[i] / c, '--',
         label=f"scaled non-traditional {i}", color='tab:green')
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.ylim(0, 1)
plt.xlim(xmin, xmax)
ax.set_xticks([])
# Compare all PU approaches for model 2 (decision tree).
i = 2
fig, ax = plt.subplots(figsize=(15, 7))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw string: "\P" is an invalid escape otherwise
plt.plot(x, pr_pos_if_x, label="True probabilities", linewidth=5, color='orange')
plt.plot(x, ERM_weighted_probs_models[i], label=f"ERM instance weighted {i}", color='tab:blue')
plt.plot(x, EN_weighted_probs_models[i], label=f"EN instance weighted {i}", color='tab:purple')
plt.plot(x, weighted_probs_models[i], label=f"class-weighted {i}", color='tab:cyan')
# Label fix: this curve is the rescaled non-traditional classifier,
# not a duplicate of the class-weighted one.
plt.plot(x, non_traditional_classifier_probs_models[i] / c, '--',
         label=f"scaled non-traditional {i}", color='tab:green')
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.ylim(0, 1)
plt.xlim(xmin, xmax)
ax.set_xticks([])