# IMPORTS
# base
from warnings import simplefilter
# external
from plotly.offline import init_notebook_mode
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import plotly.express as px
# local
from utils.utils import * # import all functions and global variables
# NOTEBOOK SETTINGS
init_notebook_mode(connected=True)
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action="ignore", category=UserWarning)
simplefilter(action="ignore", category=ConvergenceWarning)
simplefilter(action="ignore", category=FitFailedWarning)
data_file = "data/GSE66099.csv"
DEGs_file = "data/GSE66099_DEGs.txt" # None -> analysis of all genes
label = "Died"
metadata_cols = ["Gender", "Age", "PRISM"]
misc_cols = ["SID", "StudyID", "Type"]
random_seed = 1
Feature | Description |
---|---|
SID | Patient ID |
A2M - ZYX | Investigated genes |
StudyID | GEO study accession number |
Gender | male/female |
Age | 0.0 - 14.8 |
PRISM | Pediatric RISc of Mortality |
Died | Did the patient die? 1 - yes, 0 - no |
input = pd.read_csv(data_file)
input.head()
# gene expression profile
metadata = input[metadata_cols]
expression_profile = input.drop(columns=misc_cols + [label] + metadata_cols)
y = input[label].values.T
# select differentially expressed genes (DEGs)
if DEGs_file:
DEGs = pd.read_csv(DEGs_file, header=None).values
DEGs = DEGs.reshape(len(DEGs))
expression_profile = expression_profile[DEGs]
# concatenate expression profile and metadata
X = pd.concat([expression_profile, metadata], axis=1)
# convert gender to numeric data
X["Gender"] = pd.get_dummies(X["Gender"], drop_first=True).astype(int)
X.head()
meta_df = input[[label] + metadata_cols]
for col in meta_df:
value_dict = dict(meta_df[col].value_counts())
categories = [col for _ in value_dict]
metadata_vcs = [list(value_dict.values()), list(value_dict.keys()), categories]
metadata_vcs = np.array(metadata_vcs).T
metadata_vcs = pd.DataFrame(metadata_vcs, columns=["Count", "Label", "Category"])
metadata_bar = px.histogram(metadata_vcs, y="Category", color="Label", x="Count", orientation="h")
metadata_bar.update_layout(dict(
title=f"{col} proportions",
yaxis_title="",
xaxis_title="Counts",
yaxis_showticklabels=False,
xaxis_showticklabels=False,
legend_title=col,
width=1000,
height=300,
))
metadata_bar.show()
# only plot expression profile when using DEGs_file
# otherwise the plot is very large and demanding for GPU to render
if DEGs_file:
ep_heatmap_all = px.imshow(expression_profile.values, x=DEGs, width=2000, height=500)
ep_heatmap_all.update_layout(dict(
title="Expression profiles of all patients",
yaxis_title="",
yaxis_showticklabels=False,
xaxis_title="Gene",
coloraxis_colorbar=dict(title="Expression level")
))
ep_heatmap_all.show()
if DEGs_file:
ep_heatmap1 = px.imshow(expression_profile.values[np.where(y == 1)],
x=DEGs, width=2000, height=500)
ep_heatmap1.update_layout(dict(
title="Expression profiles of patients who died",
yaxis_title="",
yaxis_showticklabels=False,
xaxis_title="Gene",
coloraxis_colorbar=dict(title="Expression level")
))
ep_heatmap1.show()
if DEGs_file:
ep_heatmap0 = px.imshow(expression_profile.values[np.where(y == 0)],
x=DEGs, width=2000, height=500)
ep_heatmap0.update_layout(dict(
title="Expression profiles of patients who did not die",
yaxis_title="",
yaxis_showticklabels=False,
xaxis_title="Gene",
coloraxis_colorbar=dict(title="Expression level")
))
ep_heatmap0.show()
Due to the small number of observations (199) only 20% is selected as a test set.
train_X, test_X, train_y, test_y = train_test_split(
X, y, test_size=0.2, random_state=random_seed
)
Hyperparameter tuning was done automatically using grid_search()
method.
rfc = RandomForestClassifier()
rfc.get_params()
rfc_param_grid = dict(
random_state=[random_seed],
n_estimators=[10, 50, 100, 250, 500],
max_depth=range(2, 6),
)
rfc_grid_search = GridSearchCV(
estimator=rfc,
param_grid=rfc_param_grid
)
lrc = Pipeline([
("std", StandardScaler()),
("lrc", LogisticRegression())
])
lrc.get_params()
lrc_param_grid = dict(
lrc__random_state=[random_seed],
lrc__C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
lrc__penalty=["l2", "none"],
lrc__solver=["lbfgs", "liblinear", "newton-cg", "sag", "saga"],
lrc__max_iter=[1000]
)
lrc_grid_search = GridSearchCV(
estimator=lrc,
param_grid=lrc_param_grid
)
Training Random Forest Classifier on all DEGs.
rfc_grid_search.fit(train_X, train_y)
print(rfc_grid_search.best_params_)
rfc_v1 = rfc_grid_search.best_estimator_
evaluate(rfc_v1, test_X, test_y, "rfc_all")
Training Logistic Regression Classifier on all DEGs.
lrc_grid_search.fit(train_X, train_y)
print(lrc_grid_search.best_params_)
lrc_v1 = lrc_grid_search.best_estimator_
evaluate(lrc_v1, test_X, test_y, "lrc_all")
Training again on the whole dataset to not decrease selector performance.
rfc_grid_search.fit(X, y)
print(rfc_grid_search.best_params_)
rfc_selector = rfc_grid_search.best_estimator_
Visualising feature importance.
importances = rfc_selector.feature_importances_
idxs = np.argsort(importances)[::-1] # sort descending
feature_importance = px.bar(y=importances[idxs], x=X.columns[idxs])
feature_importance.update_layout(dict(
yaxis_title=r"Importance [%]",
xaxis_title="Feature",
))
Selecting features.
selector = SelectFromModel(
rfc_selector,
threshold=0.01,
prefit=True
)
train_X_sel = selector.transform(train_X.values)
test_X_sel = selector.transform(test_X.values)
print(f"Selected {train_X_sel.shape[1]} features.")
importances = rfc_selector.feature_importances_
idxs = np.argsort(importances)[::-1] # sort descending
feature_importance_sel = px.bar(y=importances[idxs], x=X.columns[idxs])
feature_importance_sel.add_vline(x=train_X_sel.shape[1]+0.5, line_dash="dash", line_color="green")
feature_importance_sel.add_vrect(
x0=-0.5, x1=train_X_sel.shape[1]+0.5, line_width=0, fillcolor="green", opacity=0.2
)
feature_importance_sel.update_layout(dict(
yaxis_title=r"Importance [%]",
xaxis_title="Feature",
))
pca = PCA()
pca.fit(train_X.values)
pca_expl_var = px.line(y=np.cumsum(pca.explained_variance_ratio_), line_shape="hv",
color=px.Constant("Cumulative explained variance"))
pca_expl_var.add_bar(y=pca.explained_variance_ratio_, name="Explained variance")
# fill and vertical line at x=13
pca_expl_var.add_vline(x=13, line_dash="dash", line_color="green")
pca_expl_var.add_vrect(x0=-0.5, x1=13, line_width=0, fillcolor="green", opacity=0.2)
# fill and vertical line at x=34
pca_expl_var.add_vline(x=train_X_sel.shape[1]+0.5, line_dash="dash", line_color="red")
pca_expl_var.add_vrect(x0=13, x1=train_X_sel.shape[1]+0.5, line_width=0, fillcolor="red", opacity=0.2)
pca_expl_var.update_layout(dict(
yaxis_title=r"% of explained variance",
xaxis_title="Number of principal components",
))
With 13 PCs Cumulative explained variance > 0.9.
extractor_min = PCA(n_components=13)
train_X_ext_min = extractor_min.fit_transform(train_X.values)
test_X_ext_min = extractor_min.fit_transform(test_X.values)
print(f"Extracted {train_X_ext_min.shape[1]} features.")
What if number of PCs is equal to number of features selected with RFC?
extractor_max = PCA(n_components=train_X_sel.shape[1])
train_X_ext_max = extractor_max.fit_transform(train_X.values)
test_X_ext_max = extractor_max.fit_transform(test_X.values)
print(f"Extracted {train_X_ext_max.shape[1]} features.")
Training Random Forest Classifier using selected features.
rfc_grid_search.fit(train_X_sel, train_y)
print(rfc_grid_search.best_params_)
rfc_v2 = rfc_grid_search.best_estimator_
evaluate(rfc_v2, test_X_sel, test_y, "rfc_sel")
Training Random Forest Classifier using extracted features (less PCs).
rfc_grid_search.fit(train_X_ext_min, train_y)
print(rfc_grid_search.best_params_)
rfc_v3 = rfc_grid_search.best_estimator_
evaluate(rfc_v3, test_X_ext_min, test_y, "rfc_ext_less")
Training Random Forest Classifier using extracted features (more PCs).
rfc_grid_search.fit(train_X_ext_max, train_y)
print(rfc_grid_search.best_params_)
rfc_v4 = rfc_grid_search.best_estimator_
evaluate(rfc_v4, test_X_ext_max, test_y, "rfc_ext_more")
Training Logistic Regression Classifier using selected features.
lrc_grid_search.fit(train_X_sel, train_y)
print(lrc_grid_search.best_params_)
lrc_v2 = lrc_grid_search.best_estimator_
evaluate(lrc_v2, test_X_sel, test_y, "lrc_sel")
Training Logistic Regression Classifier using extracted features (less PCs).
lrc_grid_search.fit(train_X_ext_min, train_y)
print(lrc_grid_search.best_params_)
lrc_v3 = lrc_grid_search.best_estimator_
evaluate(lrc_v3, test_X_ext_min, test_y, "lrc_ext_min")
Training Logistic Regression Classifier using extracted features (more PCs).
lrc_grid_search.fit(train_X_ext_max, train_y)
print(lrc_grid_search.best_params_)
lrc_v4 = lrc_grid_search.best_estimator_
evaluate(lrc_v4, test_X_ext_max, test_y, "lrc_ext_max")
models = list(acc_scores.keys())
model_scores = list(acc_scores.values())
model_comp = px.bar(x=models, y=model_scores)
model_comp.update_layout(dict(
yaxis_title="Accuracy",
xaxis_title="Model",
))
for t, f, fig_title in zip(tprs.values(), fprs.values(), tprs.keys()):
fig = px.line(x=f, y=t, width=1000, height=1000)
fig.update_layout(dict(
title=fig_title,
xaxis_title="FPR",
yaxis_title="TPR",
shapes=[
dict(
type='line', line_dash="dash",
yref='y', y0=0, y1=1,
xref='x', x0=0, x1= 1
)
]
))
fig.show()
px.bar(y=list(auc_scores.values()), x=list(auc_scores.keys()))
model_comp.update_layout(dict(
yaxis_title="AUC",
xaxis_title="Model",
))