We will build a classification model to distinguish between two types of cancer, acute lymphoblastic leukemia (ALL) and acute myeloid leukemia (AML), using gene expression measurements. Our data will be such that 0 indicates the ALL class and 1 indicates the AML class.
We will use logistic regression to build a classification model for this data set. We will start with the naive approach of using a linear regression classifier, to show why logistic regression is preferable.
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import seaborn as sns
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
%matplotlib inline
np.random.seed(9001)
df = pd.read_csv('dataset_gene.csv')
msk = np.random.rand(len(df)) < 0.5
data_train = df[msk]
data_test = df[~msk]
data_train.head()
df_train = data_train.copy()
df_train.head()
df_test = data_test.copy()
df_train.head()
df_train = df_train.set_index('Cancer_type')
df_test = df_test.set_index('Cancer_type')
df_train = (df_train - df_train.min(axis =0))/(df_train.max(axis =0) - df_train.min(axis = 0))
df_test = (df_test - df_test.min(axis =0))/(df_test.max(axis =0) - df_test.min(axis = 0))
df_train = df_train.sort_index()
for name, group in df_train[['D49818_at', 'M23161_at', 'hum_alu_at', 'AFFX-PheX-5_at', 'M15990_at']].groupby(by = df_train.index):
plt.figure(figsize = (10, 6))
sns.heatmap(group, cmap ="Blues")
plt.show()
M23161_at seems to have the greatest gradient distinction, indicating that it may be the most useful for classifying cancer type.
X_train = df_train
X_test = df_test
y_train = df_train.index
y_test = df_test.index
pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train)
X_train_pca_df = pd.DataFrame(X_train_pca)
X_train_pca_df["Cancer_type"] = y_train.values
X_train_pca_df["Cancer_type"] = X_train_pca_df["Cancer_type"].apply(lambda x:'#6da0f2'if x==0 else '#f28a60' )
plt.scatter(X_train_pca_df.iloc[:, 0], X_train_pca_df.iloc[:, 1] ,c=X_train_pca_df["Cancer_type"])
plt.scatter(X_train_pca_df.iloc[:, 0], X_train_pca_df.iloc[:, 1] ,c=X_train_pca_df["Cancer_type"])
leg = plt.legend((0,1))
leg_colors = ['#6da0f2', '#f28a60']
i=0
for marker in leg.legendHandles:
marker.set_color(leg_colors[i])
i+=1
There is no clear decision boundary generated by a 2D plot of the PCA at 2 components.
The problem with interpreting the scores predicted by the regression model as the probability that the patient has the ALL type cancer is that the linear regression model can predict values outside the interval (0, 1). We should not be getting probabilities outside this range.
X_train_constant = sm.add_constant(X_train['D29963_at'])
X_test_constant = sm.add_constant(X_test['D29963_at'])
linreg = OLS(y_train, X_train_constant).fit()
linreg.summary()
lin_class_train_predictions = linreg.predict(X_train_constant).apply(lambda x: 1 if x>.5 else 0)
lin_class_test_predictions = linreg.predict(X_test_constant).apply(lambda x: 1 if x>.5 else 0)
def get_classification_rate(lin_class_predictions):
misclassifications = 0
for tup in zip(lin_class_predictions.index, lin_class_predictions.values):
if tup[0] != tup[1]:
misclassifications +=1
class_rate = 1 - misclassifications/ lin_class_predictions.shape[0]
return class_rate
print("OLS binary classification\n")
print('training classification rate: ', get_classification_rate(lin_class_train_predictions))
print('testing classification rate: ', get_classification_rate(lin_class_test_predictions))
logreg = LogisticRegression(C = 100000, fit_intercept= False).fit(X_train_constant, y_train)
print('Logistic regression\n')
print('training score: ', logreg.score(X_train_constant, y_train))
print('test score: ',logreg.score(X_test_constant, y_test))
plt.scatter(X_train['D29963_at'], linreg.predict(X_train_constant), label = 'ols')
plt.scatter(X_train['D29963_at'], logreg.predict_proba(X_train_constant)[:, 1], label = 'logreg')
plt.scatter(X_train['D29963_at'], y_train, label = 'cancer type')
plt.legend()
X_train_const_mult = sm.add_constant(X_train)
X_test_const_mult = sm.add_constant(X_test)
logreg_model = LogisticRegression(C = 100000).fit(X_train_const_mult, y_train)
print("Training score: ", logreg_model.score(X_train_const_mult, y_train))
print("Test score: ", logreg_model.score(X_test_const_mult, y_test))
def visualize_prob(model, x, y, ax):
# Use the model to predict probabilities for
y_pred = model.predict_proba(x)
# Separate the predictions on the label 1 and label 0 points
ypos = y_pred[y==1]
yneg = y_pred[y==0]
# Count the number of label 1 and label 0 points
npos = ypos.shape[0]
nneg = yneg.shape[0]
# Plot the probabilities on a vertical line at x = 0,
# with the positive points in blue and negative points in red
pos_handle = ax.plot(np.zeros((npos,1)), ypos[:,1], 'bx', label = 'ALL')
neg_handle = ax.plot(np.zeros((nneg,1)), yneg[:,1], 'rx', label = 'AML')
# Line to mark prob 0.5
ax.axhline(y = 0.5, color = 'k', linestyle = '--')
# Add y-label and legend, do not display x-axis, set y-axis limit
ax.set_ylabel('Probability of AML class')
ax.legend(loc = 'best')
ax.get_xaxis().set_visible(False)
ax.set_ylim([0,1])
f, ax = plt.subplots(2, 1, figsize = (5, 10))
visualize_prob(model = logreg_model, x = X_train_const_mult,y = y_train, ax = ax[0])
ax[0].set_title("Training")
visualize_prob(logreg_model, X_test_const_mult, y_test, ax[1])
ax[1].set_title("Test")
For the points that are closer to .5 we can say that there is only a slightly higher probability they belong in the AML class rather than the ALL class
Let's see how many of the coefficients estimated by the multiple logistic regression are significantly different from zero at a significance level of 95%
coef_logreg = logreg_model.predict(X_train_const_mult)
coef_boot = np.zeros((X_train_const_mult.shape[1],100))
for i in range(100):
boot_sample = np.random.choice(range(X_train_const_mult.shape[0]), size=X_train_const_mult.shape[0])
X_train_boot = X_train_const_mult.values[boot_sample]
y_train_boot = y_train.values[boot_sample]
logreg_boot = LogisticRegression(C=100000, fit_intercept=False)
logreg_boot.fit(X_train_boot, y_train_boot)
coef_boot[:,i] = logreg_boot.coef_
#Check distribution of coef values
#for row in coef_boot.T:
# plt.hist(row)
#plt.show()
def get_confidence_interval(coef_matrix):
confidence_intervals = []
for row in coef_matrix:
upper = np.mean(row) + 1.96*np.std(row)
lower = np.mean(row) - 1.96*np.std(row)
confidence_intervals.append((lower, upper))
return confidence_intervals
sig_coef = 0
for co in get_confidence_interval(coef_boot):
if 0< co[0] or 0> co[1]:
sig_coef +=1
print(sig_coef)
PCA is a method for reducing the dimensionality of our data. To reduce the dimensionality of the data we will use PCA and fit a logistic regression model on the first set of components contributing to 90% of the variance in the predictors.
#the number of components that explain 90% of variance
n = 1
exp_var = 0
while exp_var <.9:
n+=1
pca = PCA(n_components= n).fit(X_train, y_train)
exp_var = np.sum(pca.explained_variance_ratio_)
print(n, "components")
#transform data based on PCA for logit
X_train_pca = pca.transform(X_train)[:, :n]
X_test_pca = pca.transform(X_test)[:, :n]
#fit logistics regression model
logreg_pca = LogisticRegression(C=100000).fit(X_train_pca, y_train)
pca_xtrain_pred, pca_xtest_pred = logreg_pca.predict(X_train_pca), logreg_pca.predict(X_test_pca)
accuracy_score(y_train, pca_xtrain_pred), accuracy_score(y_test, pca_xtest_pred)
test_accuracies = np.zeros((30, 5))
training_accuracies = np.zeros((30, 5))
X_values = df.drop('Cancer_type', axis = 1)
y_values = df["Cancer_type"]
fold = 0
for x_ind, v_ind in KFold(n_splits = 5, shuffle = True, random_state = 9001).split(X_train.index):
X_train_fd = X_values.iloc[x_ind,:]
X_test_fd = X_values.iloc[v_ind,:]
y_train_fd = y_values[x_ind]
y_test_fd = y_values[v_ind]
pca = PCA().fit(X_train_fd)
X_train_fd_pca = pca.transform(X_train_fd)
X_test_fd_pca = pca.transform(X_test_fd)
for n in range(2, 30):
logit_pca_cv = LogisticRegression(C = 100000).fit(X_train_fd_pca[:, :n], y_train_fd)
training_accuracies[n, fold] = accuracy_score(y_train_fd, logit_pca_cv.predict(X_train_fd_pca[:, :n]))
test_accuracies[n, fold] = accuracy_score(y_test_fd, logit_pca_cv.predict(X_test_fd_pca[:, :n]))
fold +=1
mean_training_ac, mean_test_ac = np.mean(training_accuracies, axis = 1), np.mean(test_accuracies, axis =1)
for n, value in enumerate(mean_test_ac):
if value < 0.926829268292683:
pass
if mean_training_ac[n] ==1:
n_components = n
break
n_components
pca = PCA(n_components).fit(X_train, y_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
logit_pca_cv_fin = LogisticRegression(C = 100000).fit(X_train_pca[:, :n_components], y_train)
accuracy_score(y_train, logit_pca_cv_fin.predict(X_train_pca[:, :n_components]))
accuracy_score(y_test, logit_pca_cv_fin.predict(X_test_pca[:, :n_components]))
f, axes = plt.subplots(1, 1, figsize = (5, 5))
visualize_prob(model = logit_pca_cv_fin, x = X_test_pca[:, :n_components], y = y_test, ax = axes)
With the logit model that uses all predictors we risk finding spurious relationships between the predictor variables and our target.
Using PCA we wanted to find some lower dimensional representation of our predictors that retains our predictive capability but is less vulnerable to overfitting.
We've also given an example for why logisitic regression is useful for these kinds of predictive problems, as it gives us some rough intution for probabilities using Niave Bayes.