import numpy as np
import sklearn.linear_model
import matplotlib.pyplot as plt
from scipy.stats import norm, uniform
# --- Experiment configuration --------------------------------------------
sample_size = 100000        # examples drawn per class
neg_start = -1              # negative class ~ Uniform[neg_start, neg_start+class_width]
pos_start = 0               # positive class ~ Uniform[pos_start, pos_start+class_width]
class_width = 1
c = 0.3                     # label frequency Pr(s=1|y=1) under the SCAR assumption
alpha = 0.5                 # class prior Pr(y=1)


def ModelType():
    """Return a fresh unregularized logistic-regression model."""
    # NOTE(review): penalty="none" was removed in scikit-learn 1.2;
    # newer versions require penalty=None — confirm the pinned sklearn version.
    return sklearn.linear_model.LogisticRegression(penalty="none", max_iter=50000)


tau = 0.5                   # decision threshold on Pr(y=1|x)
xmin, xmax = -1.1, 1.1
x = np.linspace(xmin, xmax, 100000)

# True densities: the two class-conditionals are disjoint uniforms.
distr_x_if_pos = alpha * uniform.pdf(x, pos_start, class_width)
distr_x_if_neg = (1 - alpha) * uniform.pdf(x, neg_start, class_width)
distr_x = distr_x_if_pos + distr_x_if_neg
distr_x_if_lab = distr_x_if_pos * c          # SCAR: labeled density = c * positive density
distr_x_if_unl = distr_x - distr_x_if_lab

# Posteriors. Outside [-1, 1] the total density is 0 and the ratio is
# undefined; divide only on the support and keep NaN elsewhere, which
# silences the "invalid value encountered in true_divide" warnings while
# producing the same plotted values.
support = distr_x > 0
pr_pos_if_x = np.divide(distr_x_if_pos, distr_x,
                        out=np.full_like(x, np.nan), where=support)
pr_lab_if_x = np.divide(distr_x_if_lab, distr_x,
                        out=np.full_like(x, np.nan), where=support)
# Figure: class-conditional densities (left axis) and the true posterior
# Pr(y=1|x) on a twin right axis. (Removed the unused color='tab:red'.)
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution")
ax1.plot(x, distr_x_if_neg, label="negative distribution")
# ax1.plot(x,distr_x, label="total distribution")
ax2 = ax1.twinx()
color = 'tab:pink'
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim([0, 1])
# Raw string: '\P' is an invalid escape sequence in a plain string literal.
ax2.set_ylabel(r'$\Pr(y=1|x)$', color=color)  # x-label handled by ax1
ax2.plot(x, pr_pos_if_x, color=color)
fig.legend()
fig.tight_layout()
# Figure: labeled/unlabeled densities (left axis) and the true posterior
# Pr(s=1|x) on a twin right axis. (Removed the unused color='tab:red'.)
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_lab, label="labeled distribution")
ax1.plot(x, distr_x_if_unl, label="unlabeled distribution")
# ax1.plot(x,distr_x, label="total distribution")
ax2 = ax1.twinx()
color = 'tab:pink'
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim([0, 1])
# Raw string: '\P' is an invalid escape sequence in a plain string literal.
ax2.set_ylabel(r'$\Pr(s=1|x)$', color=color)  # x-label handled by ax1
ax2.plot(x, pr_lab_if_x, color=color)
fig.legend()
fig.tight_layout()
# Overlay of the two true posteriors.
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$")
plt.legend()
<ipython-input-9-241bf70dbeea>:12: RuntimeWarning: invalid value encountered in true_divide pr_pos_if_x = distr_x_if_pos/distr_x <ipython-input-9-241bf70dbeea>:13: RuntimeWarning: invalid value encountered in true_divide pr_lab_if_x = distr_x_if_lab/distr_x
<matplotlib.legend.Legend at 0x7fe085125b20>
# Sample training data from the two class-conditional uniforms.
neg_train = np.random.uniform(neg_start, neg_start + class_width, size=sample_size)
pos_train = np.random.uniform(pos_start, pos_start + class_width, size=sample_size)

n_lab = int(c * sample_size)        # number of labeled (positive) examples
n_unl = sample_size * 2 - n_lab     # everything else counts as unlabeled
lab_pos_train = pos_train[:n_lab]
# BUG FIX: was pos_train[n_lab+1:], which silently dropped one positive
# example; the complement of pos_train[:n_lab] is pos_train[n_lab:].
unl_pos_train = pos_train[n_lab:]

x_train = np.concatenate((neg_train, pos_train)).reshape((-1, 1))
y_train = np.array([0] * sample_size + [1] * sample_size)  # true labels y
s_train = np.array([0] * n_unl + [1] * n_lab)              # observed PU labels s

# Fully supervised reference model trained on the true labels.
golden_model = ModelType().fit(x_train, y_train)
golden_probs = golden_model.predict_proba(x.reshape((-1, 1)))[:, 1]
# Golden (fully supervised) model vs. the true posterior.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw string: '\P' is an invalid escape
plt.scatter(x_train, y_train, label="training data (fully labeled)")
plt.plot(x, golden_probs, label="Golden model")
plt.plot(x, pr_pos_if_x, label="True probabilities", dashes=(20, 20))
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe0a152ca90>
Yes!
# Does thresholding Pr(s=1|x)/c at tau match thresholding Pr(y=1|x) at tau?
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$")
plt.plot(x, pr_lab_if_x / c, '--', label=r"True $\frac{1}{c}\Pr(s=1|x)$",
         dashes=(20, 20), color='r')
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe0a29bfee0>
Not necessarily
# Non-traditional classifier: fit directly on the observed PU labels s.
non_traditional_classifier = ModelType().fit(x_train, s_train)
non_traditional_classifier_probs = non_traditional_classifier.predict_proba(x.reshape((-1, 1)))[:, 1]
# Elkan & Noto: under SCAR, Pr(y=1|x) = Pr(s=1|x) / c.
scaled_non_traditional_classifier_probs = 1 / c * non_traditional_classifier_probs

plt.figure(figsize=(30, 10))
plt.scatter(x_train, s_train)
# Labels fixed: the originals had an unmatched '$', which mathtext rejects.
plt.plot(x, non_traditional_classifier_probs, label=r"Non traditional classifier $\Pr(s=1|x)$")
plt.plot(x, scaled_non_traditional_classifier_probs, label=r"Scaled non traditional classifier $\Pr(y=1|x)$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$")
# plt.hlines(0.5, xmin,xmax, 'k', label="decision threshold")
# plt.vlines(0, 0,1, 'tab:pink', label="correct decision boundary")
plt.ylim([0, 1.5])
plt.legend()
<matplotlib.legend.Legend at 0x7fe0a1386370>
tau_pu = tau*c
It works in theory (with correct $\Pr(s=1|x)$)
# Theory check: with the *true* Pr(s=1|x), thresholding at tau_pu recovers
# exactly the correct decision boundary at x = 0.
fig = plt.figure(figsize=(30, 10))
ax = plt.gca()
ax.scatter(x_train, s_train)
ax.plot(x, pr_pos_if_x, label="True $Pr(y=1|x)$")
ax.plot(x, pr_lab_if_x, label="True $Pr(s=1|x)$")
ax.hlines(tau, xmin, xmax, 'k', label="decision threshold")
ax.hlines(tau_pu, xmin, xmax, 'tab:gray', label="PU decision threshold")
ax.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
ax.set_ylim([0, 1.5])
ax.legend()
<matplotlib.legend.Legend at 0x7fe09c5158e0>
But what if the non-traditional classifier does not predict correct probabilities? Then this also does not help. It is biased towards the negative class because there were more negative examples during training.
# The learned (miscalibrated) Pr(s=1|x) against both thresholds.
plt.figure(figsize=(30, 10))
plt.scatter(x_train, s_train)
# Label fixed: the original string had an unmatched '$' (mathtext error).
plt.plot(x, non_traditional_classifier_probs, label=r"Non traditional classifier $\Pr(s=1|x)$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
# 'tab:gray' (was 'grey') for consistency with the previous figure.
plt.hlines(tau_pu, xmin, xmax, 'tab:gray', label="PU decision threshold")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.ylim([0, 1.5])
plt.legend()
<matplotlib.legend.Legend at 0x7fe09ecd9910>
# Class-rebalancing weights for training on s: weight each class by the
# size of the *other* class (n_lab labeled positives vs. n_unl unlabeled
# "negatives") so both classes contribute equally to the loss.
w_neg = n_lab
w_pos = n_unl
The probabilities are not correct, but the decision threshold is expected to be correct
# Rebalanced densities: scale each component by its class weight.
distr_x_if_lab_weighted = distr_x_if_lab * w_pos
distr_x_if_unl_weighted = distr_x_if_unl * w_neg
distr_x_weighted = distr_x_if_lab_weighted + distr_x_if_unl_weighted
# Divide only where the weighted total density is positive; outside the
# support the posterior is undefined (kept as NaN). This avoids the
# "invalid value encountered in true_divide" warning without changing
# the plotted values.
pr_lab_if_x_weighted = np.divide(
    distr_x_if_lab_weighted,
    distr_x_weighted,
    out=np.full_like(distr_x_weighted, np.nan),
    where=distr_x_weighted > 0,
)
# Figure: class-weighted labeled/unlabeled densities (left axis) and the
# posterior on a twin right axis. (Removed the unused color='tab:red'.)
# NOTE(review): ax2 plots the *unweighted* pr_lab_if_x, not
# pr_lab_if_x_weighted — presumably intentional (the weighted posterior
# appears in the next figure); confirm.
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_lab_weighted, label="weighted labeled distribution")
ax1.plot(x, distr_x_if_unl_weighted, label="weighted unlabeled distribution")
ax1.plot(x, distr_x_weighted, label="weighted total distribution")
ax2 = ax1.twinx()
color = 'tab:pink'
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim([0, 1])
# Raw string: '\P' is an invalid escape in a plain string literal.
ax2.set_ylabel(r'$\Pr(s=1|x)$', color=color)  # x-label handled by ax1
ax2.plot(x, pr_lab_if_x, color=color)
fig.legend()
fig.tight_layout()
# Weighted posterior vs. the true posterior.
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
plt.plot(x, pr_pos_if_x, label=r"$\Pr(y=1|x)$")
plt.plot(x, pr_lab_if_x_weighted, label=r"$\Pr(s=1| $weighted $ x)$")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<ipython-input-20-b0f3ef612ab6>:6: RuntimeWarning: invalid value encountered in true_divide pr_lab_if_x_weighted = distr_x_if_lab_weighted/distr_x_weighted
<matplotlib.legend.Legend at 0x7fe09d350820>
Again, our model is not able to predict the correct probabilities and is biased towards the negative examples, because that space was more clearly negative. So now, not only are the probabilities incorrect, but so is the decision threshold.
# Refit on the PU labels s with the class-rebalancing sample weights:
# labeled examples get w_pos, unlabeled examples get w_neg.
weights = np.where(s_train == 1, w_pos, w_neg)
weighted_model = ModelType().fit(x_train, s_train, sample_weight=weights)
weighted_probs = weighted_model.predict_proba(x.reshape((-1, 1)))[:, 1]
# Class-weighted model vs. the true posterior.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
# Label fixed: "True" belongs outside math mode (it rendered as italic
# mathtext), and '\P' was an invalid escape in a plain string literal.
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, weighted_probs, label=r"Model from class-weighted instances $\Pr(y=1|x)$")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe09de870a0>
Yes!
# Elkan & Noto (2008): under SCAR,
#   Pr(y=1 | s=0, x) = (1-c)/c * Pr(s=1|x) / (1 - Pr(s=1|x)).
# Safe here since pr_lab_if_x <= c = 0.3 < 1 on the support (NaN elsewhere
# propagates through).
pr_pos_if_unl = (1-c)/c * pr_lab_if_x/(1-pr_lab_if_x)
# Split the unlabeled density into its positive and negative parts.
distr_x_if_unl_pos = pr_pos_if_unl * distr_x_if_unl
distr_x_if_unl_neg = (1- pr_pos_if_unl) * distr_x_if_unl
# Compare the true class-conditional densities with those reconstructed
# from the PU quantities via the Elkan-Noto weighting.
# (Removed the unused color='tab:red' assignment.)
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution")
ax1.plot(x, distr_x_if_neg, label="negative distribution")
ax1.plot(x, distr_x_if_lab + distr_x_if_unl_pos, '--',
         label="positive distribution (calculated based on s)", color='r', dashes=(20, 20))
ax1.plot(x, distr_x_if_unl_neg, '--',
         label="negative distribution (calculated based on s)", color='g', dashes=(20, 20))
fig.legend()
fig.tight_layout()
Again we have the problem of the bias in learning the non-traditional classifier, which prevents us from correctly weighting the instances. Additionally, our non-traditional classifier learned probabilities $\Pr(s=1|x)>c$, which results in probabilities $\Pr(y=1|s=0,x)>1$; this is impossible, so we need to cut them off at 1.
This together makes it not work as well in practice as in theory. But still, the resulting model is better than the one trained on class-weighted instances, which we also expected from the theory.
# --- Elkan-Noto instance weighting ---------------------------------------
x_train_lab = x_train[s_train == 1]
x_train_unl = x_train[s_train == 0]

# Predicted Pr(s=1|x) on the unlabeled examples, converted to Pr(y=1|s=0,x).
train_s_probs_unl = non_traditional_classifier.predict_proba(x_train_unl)[:, 1]
pr_pos_if_unl_train = (1 - c) / c * train_s_probs_unl / (1 - train_s_probs_unl)
# Cap at 0.99: a miscalibrated Pr(s=1|x) > c would otherwise imply
# impossible probabilities above 1.
pr_pos_if_unl_train = np.minimum(pr_pos_if_unl_train, 0.99)

# Each unlabeled example appears twice: once as a positive with weight p
# and once as a negative with weight 1-p; labeled positives get weight 1.
x_train_EN = np.concatenate((x_train_lab, x_train_unl, x_train_unl))
y_train_EN = np.concatenate((np.ones(n_lab + n_unl), np.zeros(n_unl)))
weights_EN = np.concatenate((np.ones(n_lab), pr_pos_if_unl_train, 1 - pr_pos_if_unl_train))
EN_weighted_model = ModelType().fit(x_train_EN, y_train_EN, sample_weight=weights_EN)
EN_weighted_probs = EN_weighted_model.predict_proba(x.reshape((-1, 1)))[:, 1]
EN_weighted_probs = EN_weighted_model.predict_proba(x.reshape((-1,1)))[:,1]
# EN model vs. class-weighted model vs. the true posterior.
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
# plt.plot(x,non_traditional_classifier_probs, label="$Predicted \Pr(s=1|x)$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, EN_weighted_probs, label=r"EN Model $\Pr(y=1| x)$")
plt.plot(x, weighted_probs, label=r"Model from class-weighted instances $\Pr(y=1|x)$")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe0839ec5b0>
Yes!
# Alternative decomposition: rescale the labeled density by 1/c to recover
# the positive density (SCAR), then subtract it from the total density to
# recover the negative density.
distr_x_pos_rescaling_lab = 1/c * distr_x_if_lab
distr_x_neg_total_minus_pos = distr_x - distr_x_pos_rescaling_lab
# True densities vs. the 1/c-rescaling reconstruction.
# (Removed the unused color='tab:red' assignment.)
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution")
ax1.plot(x, distr_x_if_neg, label="negative distribution")
ax1.plot(x, distr_x_pos_rescaling_lab, '--',
         label="positive distribution by rescaling labeled distribution", color='r', dashes=(20, 20))
ax1.plot(x, distr_x_neg_total_minus_pos, '--',
         label="negative distribution by subtracting positive distribution from total", color='g', dashes=(20, 20))
fig.legend()
fig.tight_layout()
Yes! But your model needs to be able to handle negative weights, and this is not always the case. ==> Method modification possible to handle edge cases
# --- Empirical-risk-minimization reweighting -----------------------------
x_train_lab = x_train[s_train == 1]
x_train_unl = x_train[s_train == 0]

# Duplicate the labeled positives: once as positives with weight 1/c and
# once as negatives with (negative!) weight 1 - 1/c; unlabeled examples
# count as plain negatives with weight 1.
x_train_ERM = np.concatenate((x_train_lab, x_train_lab, x_train_unl))
y_train_ERM = np.concatenate((np.ones(n_lab), np.zeros(n_lab + n_unl)))
weights_ERM = np.concatenate((np.full(n_lab, 1 / c),
                              np.full(n_lab, 1 - 1 / c),
                              np.ones(n_unl)))
ERM_weighted_model = ModelType().fit(x_train_ERM, y_train_ERM, sample_weight=weights_ERM)
ERM_weighted_probs = ERM_weighted_model.predict_proba(x.reshape((-1, 1)))[:, 1]
# ERM model vs. EN model vs. class-weighted model vs. the true posterior.
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
# plt.plot(x,non_traditional_classifier_probs, label="$Predicted \Pr(s=1|x)$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, ERM_weighted_probs, '--', label=r"ERM Model $\Pr(y=1| x)$", dashes=(20, 20))
plt.plot(x, EN_weighted_probs, label=r"EN Model $\Pr(y=1| x)$")
plt.plot(x, weighted_probs, label=r"Model from class-weighted instances $\Pr(y=1|x)$")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe086a764c0>