import numpy as np
import sklearn.linear_model
import matplotlib.pyplot as plt
from scipy.stats import norm, uniform
# --- Experiment configuration --------------------------------------------
sample_size = 100000        # examples drawn per class
neg_start = -1              # negative class ~ Uniform[neg_start, neg_start+class_width]
pos_start = 0               # positive class ~ Uniform[pos_start, pos_start+class_width]
class_width = 1
c = 0.3                     # label frequency Pr(s=1|y=1) under the SCAR assumption
alpha = 0.5                 # class prior Pr(y=1)


def ModelType():
    """Return a fresh unregularized logistic-regression model."""
    # NOTE(review): penalty="none" was removed in scikit-learn 1.2;
    # newer versions require penalty=None — confirm the pinned sklearn version.
    return sklearn.linear_model.LogisticRegression(penalty="none", max_iter=50000)


tau = 0.5                   # decision threshold on Pr(y=1|x)
xmin, xmax = -1.1, 1.1
x = np.linspace(xmin, xmax, 100000)

# True densities: the two class-conditionals are disjoint uniforms.
distr_x_if_pos = alpha * uniform.pdf(x, pos_start, class_width)
distr_x_if_neg = (1 - alpha) * uniform.pdf(x, neg_start, class_width)
distr_x = distr_x_if_pos + distr_x_if_neg
distr_x_if_lab = distr_x_if_pos * c          # SCAR: labeled density = c * positive density
distr_x_if_unl = distr_x - distr_x_if_lab

# Posteriors. Outside [-1, 1] the total density is 0 and the ratio is
# undefined; divide only on the support and keep NaN elsewhere, which
# silences the "invalid value encountered in true_divide" warnings while
# producing the same plotted values.
support = distr_x > 0
pr_pos_if_x = np.divide(distr_x_if_pos, distr_x,
                        out=np.full_like(x, np.nan), where=support)
pr_lab_if_x = np.divide(distr_x_if_lab, distr_x,
                        out=np.full_like(x, np.nan), where=support)
# Figure: class-conditional densities (left axis) and the true posterior
# Pr(y=1|x) on a twin right axis. (Removed the unused color='tab:red'.)
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution")
ax1.plot(x, distr_x_if_neg, label="negative distribution")
# ax1.plot(x,distr_x, label="total distribution")
ax2 = ax1.twinx()
color = 'tab:pink'
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim([0, 1])
# Raw string: '\P' is an invalid escape sequence in a plain string literal.
ax2.set_ylabel(r'$\Pr(y=1|x)$', color=color)  # x-label handled by ax1
ax2.plot(x, pr_pos_if_x, color=color)
fig.legend()
fig.tight_layout()
# Figure: labeled/unlabeled densities (left axis) and the true posterior
# Pr(s=1|x) on a twin right axis. (Removed the unused color='tab:red'.)
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_lab, label="labeled distribution")
ax1.plot(x, distr_x_if_unl, label="unlabeled distribution")
# ax1.plot(x,distr_x, label="total distribution")
ax2 = ax1.twinx()
color = 'tab:pink'
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim([0, 1])
# Raw string: '\P' is an invalid escape sequence in a plain string literal.
ax2.set_ylabel(r'$\Pr(s=1|x)$', color=color)  # x-label handled by ax1
ax2.plot(x, pr_lab_if_x, color=color)
fig.legend()
fig.tight_layout()
# Overlay of the two true posteriors.
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$")
plt.legend()
<ipython-input-9-241bf70dbeea>:12: RuntimeWarning: invalid value encountered in true_divide pr_pos_if_x = distr_x_if_pos/distr_x <ipython-input-9-241bf70dbeea>:13: RuntimeWarning: invalid value encountered in true_divide pr_lab_if_x = distr_x_if_lab/distr_x
<matplotlib.legend.Legend at 0x7fe085125b20>
# Sample training data from the two class-conditional uniforms.
neg_train = np.random.uniform(neg_start, neg_start + class_width, size=sample_size)
pos_train = np.random.uniform(pos_start, pos_start + class_width, size=sample_size)

n_lab = int(c * sample_size)        # number of labeled (positive) examples
n_unl = sample_size * 2 - n_lab     # everything else counts as unlabeled
lab_pos_train = pos_train[:n_lab]
# BUG FIX: was pos_train[n_lab+1:], which silently dropped one positive
# example; the complement of pos_train[:n_lab] is pos_train[n_lab:].
unl_pos_train = pos_train[n_lab:]

x_train = np.concatenate((neg_train, pos_train)).reshape((-1, 1))
y_train = np.array([0] * sample_size + [1] * sample_size)  # true labels y
s_train = np.array([0] * n_unl + [1] * n_lab)              # observed PU labels s

# Fully supervised reference model trained on the true labels.
golden_model = ModelType().fit(x_train, y_train)
golden_probs = golden_model.predict_proba(x.reshape((-1, 1)))[:, 1]
# Golden (fully supervised) model vs. the true posterior.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr(y=1|x)$")  # raw string: '\P' is an invalid escape
plt.scatter(x_train, y_train, label="training data (fully labeled)")
plt.plot(x, golden_probs, label="Golden model")
plt.plot(x, pr_pos_if_x, label="True probabilities", dashes=(20, 20))
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe0a152ca90>
Yes!
# Does thresholding Pr(s=1|x)/c at tau match thresholding Pr(y=1|x) at tau?
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$")
plt.plot(x, pr_lab_if_x / c, '--', label=r"True $\frac{1}{c}\Pr(s=1|x)$",
         dashes=(20, 20), color='r')
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe0a29bfee0>
Not necessarily
# Non-traditional classifier: fit directly on the observed PU labels s.
non_traditional_classifier = ModelType().fit(x_train, s_train)
non_traditional_classifier_probs = non_traditional_classifier.predict_proba(x.reshape((-1, 1)))[:, 1]
# Elkan & Noto: under SCAR, Pr(y=1|x) = Pr(s=1|x) / c.
scaled_non_traditional_classifier_probs = 1 / c * non_traditional_classifier_probs

plt.figure(figsize=(30, 10))
plt.scatter(x_train, s_train)
# Labels fixed: the originals had an unmatched '$', which mathtext rejects.
plt.plot(x, non_traditional_classifier_probs, label=r"Non traditional classifier $\Pr(s=1|x)$")
plt.plot(x, scaled_non_traditional_classifier_probs, label=r"Scaled non traditional classifier $\Pr(y=1|x)$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, pr_lab_if_x, label=r"True $\Pr(s=1|x)$")
# plt.hlines(0.5, xmin,xmax, 'k', label="decision threshold")
# plt.vlines(0, 0,1, 'tab:pink', label="correct decision boundary")
plt.ylim([0, 1.5])
plt.legend()
<matplotlib.legend.Legend at 0x7fe0a1386370>
tau_pu = tau*c
It works in theory (with correct $\Pr(s=1|x)$)
# Theory check: with the *true* Pr(s=1|x), thresholding at tau_pu recovers
# exactly the correct decision boundary at x = 0.
fig = plt.figure(figsize=(30, 10))
ax = plt.gca()
ax.scatter(x_train, s_train)
ax.plot(x, pr_pos_if_x, label="True $Pr(y=1|x)$")
ax.plot(x, pr_lab_if_x, label="True $Pr(s=1|x)$")
ax.hlines(tau, xmin, xmax, 'k', label="decision threshold")
ax.hlines(tau_pu, xmin, xmax, 'tab:gray', label="PU decision threshold")
ax.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
ax.set_ylim([0, 1.5])
ax.legend()
<matplotlib.legend.Legend at 0x7fe09c5158e0>
But what if the non-traditional classifier does not predict correct probabilities? Then this also does not help. It is biased towards the negative class because there were more negative examples during training.
# The learned (miscalibrated) Pr(s=1|x) against both thresholds.
plt.figure(figsize=(30, 10))
plt.scatter(x_train, s_train)
# Label fixed: the original string had an unmatched '$' (mathtext error).
plt.plot(x, non_traditional_classifier_probs, label=r"Non traditional classifier $\Pr(s=1|x)$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
# 'tab:gray' (was 'grey') for consistency with the previous figure.
plt.hlines(tau_pu, xmin, xmax, 'tab:gray', label="PU decision threshold")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.ylim([0, 1.5])
plt.legend()
<matplotlib.legend.Legend at 0x7fe09ecd9910>
# Class-rebalancing weights for training on s: weight each class by the
# size of the *other* class (n_lab labeled positives vs. n_unl unlabeled
# "negatives") so both classes contribute equally to the loss.
w_neg = n_lab
w_pos = n_unl
The probabilities are not correct, but the decision threshold is expected to be correct
# Rebalanced densities: scale each component by its class weight.
distr_x_if_lab_weighted = distr_x_if_lab * w_pos
distr_x_if_unl_weighted = distr_x_if_unl * w_neg
distr_x_weighted = distr_x_if_lab_weighted + distr_x_if_unl_weighted
# Divide only where the weighted total density is positive; outside the
# support the posterior is undefined (kept as NaN). This avoids the
# "invalid value encountered in true_divide" warning without changing
# the plotted values.
pr_lab_if_x_weighted = np.divide(
    distr_x_if_lab_weighted,
    distr_x_weighted,
    out=np.full_like(distr_x_weighted, np.nan),
    where=distr_x_weighted > 0,
)
# Figure: class-weighted labeled/unlabeled densities (left axis) and the
# posterior on a twin right axis. (Removed the unused color='tab:red'.)
# NOTE(review): ax2 plots the *unweighted* pr_lab_if_x, not
# pr_lab_if_x_weighted — presumably intentional (the weighted posterior
# appears in the next figure); confirm.
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_lab_weighted, label="weighted labeled distribution")
ax1.plot(x, distr_x_if_unl_weighted, label="weighted unlabeled distribution")
ax1.plot(x, distr_x_weighted, label="weighted total distribution")
ax2 = ax1.twinx()
color = 'tab:pink'
ax2.tick_params(axis='y', labelcolor=color)
ax2.set_ylim([0, 1])
# Raw string: '\P' is an invalid escape in a plain string literal.
ax2.set_ylabel(r'$\Pr(s=1|x)$', color=color)  # x-label handled by ax1
ax2.plot(x, pr_lab_if_x, color=color)
fig.legend()
fig.tight_layout()
# Weighted posterior vs. the true posterior.
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
plt.plot(x, pr_pos_if_x, label=r"$\Pr(y=1|x)$")
plt.plot(x, pr_lab_if_x_weighted, label=r"$\Pr(s=1| $weighted $ x)$")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<ipython-input-20-b0f3ef612ab6>:6: RuntimeWarning: invalid value encountered in true_divide pr_lab_if_x_weighted = distr_x_if_lab_weighted/distr_x_weighted
<matplotlib.legend.Legend at 0x7fe09d350820>
Again, our model is not able to predict the correct probabilities and is biased towards the negative examples, because that space was more clearly negative. So now, not only are the probabilities incorrect, but so is the decision threshold.
# Refit on the PU labels s with the class-rebalancing sample weights:
# labeled examples get w_pos, unlabeled examples get w_neg.
weights = np.where(s_train == 1, w_pos, w_neg)
weighted_model = ModelType().fit(x_train, s_train, sample_weight=weights)
weighted_probs = weighted_model.predict_proba(x.reshape((-1, 1)))[:, 1]
# Class-weighted model vs. the true posterior.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
# Label fixed: "True" belongs outside math mode (it rendered as italic
# mathtext), and '\P' was an invalid escape in a plain string literal.
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, weighted_probs, label=r"Model from class-weighted instances $\Pr(y=1|x)$")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe09de870a0>
Yes!
# Elkan & Noto (2008): under SCAR,
#   Pr(y=1 | s=0, x) = (1-c)/c * Pr(s=1|x) / (1 - Pr(s=1|x)).
# Safe here since pr_lab_if_x <= c = 0.3 < 1 on the support (NaN elsewhere
# propagates through).
pr_pos_if_unl = (1-c)/c * pr_lab_if_x/(1-pr_lab_if_x)
# Split the unlabeled density into its positive and negative parts.
distr_x_if_unl_pos = pr_pos_if_unl * distr_x_if_unl
distr_x_if_unl_neg = (1- pr_pos_if_unl) * distr_x_if_unl
# Compare the true class-conditional densities with those reconstructed
# from the PU quantities via the Elkan-Noto weighting.
# (Removed the unused color='tab:red' assignment.)
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution")
ax1.plot(x, distr_x_if_neg, label="negative distribution")
ax1.plot(x, distr_x_if_lab + distr_x_if_unl_pos, '--',
         label="positive distribution (calculated based on s)", color='r', dashes=(20, 20))
ax1.plot(x, distr_x_if_unl_neg, '--',
         label="negative distribution (calculated based on s)", color='g', dashes=(20, 20))
fig.legend()
fig.tight_layout()
Again we have the problem of the bias in learning the non-traditional classifier, which prevents us from correctly weighting the instances. Additionally, our non-traditional classifier learned probabilities $\Pr(s=1|x)>c$, which results in probabilities $\Pr(y=1|s=0,x)>1$; this is impossible, so we need to cut them off at 1.
This together makes it not work as well in practice as in theory. But still, the resulting model is better than the one trained on class-weighted instances, which we also expected from the theory.
# --- Elkan-Noto instance weighting ---------------------------------------
x_train_lab = x_train[s_train == 1]
x_train_unl = x_train[s_train == 0]

# Predicted Pr(s=1|x) on the unlabeled examples, converted to Pr(y=1|s=0,x).
train_s_probs_unl = non_traditional_classifier.predict_proba(x_train_unl)[:, 1]
pr_pos_if_unl_train = (1 - c) / c * train_s_probs_unl / (1 - train_s_probs_unl)
# Cap at 0.99: a miscalibrated Pr(s=1|x) > c would otherwise imply
# impossible probabilities above 1.
pr_pos_if_unl_train = np.minimum(pr_pos_if_unl_train, 0.99)

# Each unlabeled example appears twice: once as a positive with weight p
# and once as a negative with weight 1-p; labeled positives get weight 1.
x_train_EN = np.concatenate((x_train_lab, x_train_unl, x_train_unl))
y_train_EN = np.concatenate((np.ones(n_lab + n_unl), np.zeros(n_unl)))
weights_EN = np.concatenate((np.ones(n_lab), pr_pos_if_unl_train, 1 - pr_pos_if_unl_train))
EN_weighted_model = ModelType().fit(x_train_EN, y_train_EN, sample_weight=weights_EN)
EN_weighted_probs = EN_weighted_model.predict_proba(x.reshape((-1, 1)))[:, 1]
EN_weighted_probs = EN_weighted_model.predict_proba(x.reshape((-1,1)))[:,1]
# EN model vs. class-weighted model vs. the true posterior.
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
# plt.plot(x,non_traditional_classifier_probs, label="$Predicted \Pr(s=1|x)$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, EN_weighted_probs, label=r"EN Model $\Pr(y=1| x)$")
plt.plot(x, weighted_probs, label=r"Model from class-weighted instances $\Pr(y=1|x)$")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe0839ec5b0>
Yes!
# Alternative decomposition: rescale the labeled density by 1/c to recover
# the positive density (SCAR), then subtract it from the total density to
# recover the negative density.
distr_x_pos_rescaling_lab = 1/c * distr_x_if_lab
distr_x_neg_total_minus_pos = distr_x - distr_x_pos_rescaling_lab
# True densities vs. the 1/c-rescaling reconstruction.
# (Removed the unused color='tab:red' assignment.)
fig, ax1 = plt.subplots(figsize=(15, 5))
ax1.set_ylabel('data distribution')
ax1.plot(x, distr_x_if_pos, label="positive distribution")
ax1.plot(x, distr_x_if_neg, label="negative distribution")
ax1.plot(x, distr_x_pos_rescaling_lab, '--',
         label="positive distribution by rescaling labeled distribution", color='r', dashes=(20, 20))
ax1.plot(x, distr_x_neg_total_minus_pos, '--',
         label="negative distribution by subtracting positive distribution from total", color='g', dashes=(20, 20))
fig.legend()
fig.tight_layout()
Yes! But your model needs to be able to handle negative weights, and this is not always the case. ==> Method modification possible to handle edge cases
# --- Empirical-risk-minimization reweighting -----------------------------
x_train_lab = x_train[s_train == 1]
x_train_unl = x_train[s_train == 0]

# Duplicate the labeled positives: once as positives with weight 1/c and
# once as negatives with (negative!) weight 1 - 1/c; unlabeled examples
# count as plain negatives with weight 1.
x_train_ERM = np.concatenate((x_train_lab, x_train_lab, x_train_unl))
y_train_ERM = np.concatenate((np.ones(n_lab), np.zeros(n_lab + n_unl)))
weights_ERM = np.concatenate((np.full(n_lab, 1 / c),
                              np.full(n_lab, 1 - 1 / c),
                              np.ones(n_unl)))
ERM_weighted_model = ModelType().fit(x_train_ERM, y_train_ERM, sample_weight=weights_ERM)
ERM_weighted_probs = ERM_weighted_model.predict_proba(x.reshape((-1, 1)))[:, 1]
# ERM model vs. EN model vs. class-weighted model vs. the true posterior.
# Raw strings throughout: '\P' is an invalid escape in a plain literal.
plt.figure(figsize=(15, 5))
plt.ylabel(r"$\Pr$")
# plt.plot(x,non_traditional_classifier_probs, label="$Predicted \Pr(s=1|x)$")
plt.plot(x, pr_pos_if_x, label=r"True $\Pr(y=1|x)$")
plt.plot(x, ERM_weighted_probs, '--', label=r"ERM Model $\Pr(y=1| x)$", dashes=(20, 20))
plt.plot(x, EN_weighted_probs, label=r"EN Model $\Pr(y=1| x)$")
plt.plot(x, weighted_probs, label=r"Model from class-weighted instances $\Pr(y=1|x)$")
plt.vlines(0, 0, 1, 'tab:pink', label="correct decision boundary")
plt.hlines(tau, xmin, xmax, 'k', label="decision threshold")
plt.legend()
<matplotlib.legend.Legend at 0x7fe086a764c0>