SIRS morbidity prediction from expression profiles¶

Available at github.

Libraries and settings¶

# IMPORTS

# base
from warnings import simplefilter

# external
from plotly.offline import init_notebook_mode
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import plotly.express as px

# local
from utils.utils import *  # import all functions and global variables

# NOTEBOOK SETTINGS
init_notebook_mode(connected=True)
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action="ignore", category=UserWarning)
simplefilter(action="ignore", category=ConvergenceWarning)
simplefilter(action="ignore", category=FitFailedWarning)

Pipeline flowchart¶

Global variables¶

data_file = "data/GSE66099.csv"
DEGs_file = "data/GSE66099_DEGs.txt"  # None -> analysis of all genes

label = "Died"
metadata_cols = ["Gender", "Age", "PRISM"]
misc_cols = ["SID", "StudyID", "Type"]

random_seed = 1

Feature description¶

Feature	Description
SID	Patient ID
A2M - ZYX	Investigated genes
StudyID	GEO study accession number
Gender	male/female
Age	0.0 - 14.8
PRISM	Pediatric RISc of Mortality
Died	Did the patient die? 1 - yes, 0 - no

Loading the data¶

input = pd.read_csv(data_file)
input.head()

# gene expression profile
metadata = input[metadata_cols]
expression_profile = input.drop(columns=misc_cols + [label] + metadata_cols)
y = input[label].values.T

# select differentially expressed genes (DEGs)
if DEGs_file:
    DEGs = pd.read_csv(DEGs_file, header=None).values
    DEGs = DEGs.reshape(len(DEGs))
    expression_profile = expression_profile[DEGs]

# concatenate expression profile and metadata
X = pd.concat([expression_profile, metadata], axis=1)

# convert gender to numeric data
X["Gender"] = pd.get_dummies(X["Gender"], drop_first=True).astype(int)
X.head()

Visualisation of metadata¶

meta_df = input[[label] + metadata_cols]
for col in meta_df:
    value_dict = dict(meta_df[col].value_counts())
    categories = [col for _ in value_dict]
    metadata_vcs = [list(value_dict.values()), list(value_dict.keys()), categories]
    metadata_vcs = np.array(metadata_vcs).T
    metadata_vcs = pd.DataFrame(metadata_vcs, columns=["Count", "Label", "Category"])
    metadata_bar = px.histogram(metadata_vcs, y="Category", color="Label", x="Count", orientation="h")
    metadata_bar.update_layout(dict(
        title=f"{col} proportions",
        yaxis_title="",
        xaxis_title="Counts",
        yaxis_showticklabels=False,
        xaxis_showticklabels=False,
        legend_title=col,
        width=1000,
        height=300,
    ))
    metadata_bar.show()

Visualisation of DEGs' expression profiles¶

# only plot expression profile when using DEGs_file
# otherwise the plot is very large and demanding for GPU to render
if DEGs_file:
    ep_heatmap_all = px.imshow(expression_profile.values, x=DEGs, width=2000, height=500)
    ep_heatmap_all.update_layout(dict(
        title="Expression profiles of all patients",
        yaxis_title="",
        yaxis_showticklabels=False,
        xaxis_title="Gene",
        coloraxis_colorbar=dict(title="Expression level")
    ))
    ep_heatmap_all.show()

if DEGs_file:
    ep_heatmap1 = px.imshow(expression_profile.values[np.where(y == 1)],
                               x=DEGs, width=2000, height=500)
    ep_heatmap1.update_layout(dict(
        title="Expression profiles of patients who died",
        yaxis_title="",
        yaxis_showticklabels=False,
        xaxis_title="Gene",
        coloraxis_colorbar=dict(title="Expression level")
    ))
    ep_heatmap1.show()

if DEGs_file:
    ep_heatmap0 = px.imshow(expression_profile.values[np.where(y == 0)],
                               x=DEGs, width=2000, height=500)
    ep_heatmap0.update_layout(dict(
        title="Expression profiles of patients who did not die",
        yaxis_title="",
        yaxis_showticklabels=False,
        xaxis_title="Gene",
        coloraxis_colorbar=dict(title="Expression level")
    ))
    ep_heatmap0.show()

Finding the best model and data transformations¶

Splitting the dataset¶

Due to the small number of observations (199) only 20% is selected as a test set.

train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=random_seed
)

Model initialization and parameter grids¶

Hyperparameter tuning was done automatically using grid_search() method.

Random Forest Classifier¶

rfc = RandomForestClassifier()
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

rfc_param_grid = dict(
    random_state=[random_seed],
    n_estimators=[10, 50, 100, 250, 500],
    max_depth=range(2, 6),
)

rfc_grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=rfc_param_grid
)

Logistic Regression¶

lrc = Pipeline([
    ("std", StandardScaler()),
    ("lrc", LogisticRegression())
])
lrc.get_params()

{'memory': None,
 'steps': [('std', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('lrc',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'std': StandardScaler(copy=True, with_mean=True, with_std=True),
 'lrc': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'std__copy': True,
 'std__with_mean': True,
 'std__with_std': True,
 'lrc__C': 1.0,
 'lrc__class_weight': None,
 'lrc__dual': False,
 'lrc__fit_intercept': True,
 'lrc__intercept_scaling': 1,
 'lrc__l1_ratio': None,
 'lrc__max_iter': 100,
 'lrc__multi_class': 'auto',
 'lrc__n_jobs': None,
 'lrc__penalty': 'l2',
 'lrc__random_state': None,
 'lrc__solver': 'lbfgs',
 'lrc__tol': 0.0001,
 'lrc__verbose': 0,
 'lrc__warm_start': False}

lrc_param_grid = dict(
    lrc__random_state=[random_seed],
    lrc__C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    lrc__penalty=["l2", "none"],
    lrc__solver=["lbfgs", "liblinear", "newton-cg", "sag", "saga"],
    lrc__max_iter=[1000]
)

lrc_grid_search = GridSearchCV(
    estimator=lrc,
    param_grid=lrc_param_grid
)

Model evaluation before feature selection/extraction¶

Training Random Forest Classifier on all DEGs.

rfc_grid_search.fit(train_X, train_y)
print(rfc_grid_search.best_params_)
rfc_v1 = rfc_grid_search.best_estimator_
evaluate(rfc_v1, test_X, test_y, "rfc_all")

{'max_depth': 4, 'n_estimators': 50, 'random_state': 1}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        32
           1       0.60      0.38      0.46         8

    accuracy                           0.82        40
   macro avg       0.73      0.66      0.68        40
weighted avg       0.81      0.82      0.81        40

Training Logistic Regression Classifier on all DEGs.

lrc_grid_search.fit(train_X, train_y)
print(lrc_grid_search.best_params_)
lrc_v1 = lrc_grid_search.best_estimator_
evaluate(lrc_v1, test_X, test_y, "lrc_all")

{'lrc__C': 0.01, 'lrc__max_iter': 1000, 'lrc__penalty': 'l2', 'lrc__random_state': 1, 'lrc__solver': 'lbfgs'}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.60      0.60      0.60         5

    accuracy                           0.90        40
   macro avg       0.77      0.77      0.77        40
weighted avg       0.90      0.90      0.90        40

Feature selection via Random Forest Classifier¶

Training again on the whole dataset to not decrease selector performance.

rfc_grid_search.fit(X, y)
print(rfc_grid_search.best_params_)
rfc_selector = rfc_grid_search.best_estimator_

{'max_depth': 5, 'n_estimators': 250, 'random_state': 1}

Visualising feature importance.

importances = rfc_selector.feature_importances_
idxs = np.argsort(importances)[::-1]  # sort descending

feature_importance = px.bar(y=importances[idxs], x=X.columns[idxs])
feature_importance.update_layout(dict(
    yaxis_title=r"Importance [%]",
    xaxis_title="Feature",
))

Selecting features.

selector = SelectFromModel(
    rfc_selector,
    threshold=0.01,
    prefit=True
)
train_X_sel = selector.transform(train_X.values)
test_X_sel = selector.transform(test_X.values)
print(f"Selected {train_X_sel.shape[1]} features.")

Selected 37 features.

importances = rfc_selector.feature_importances_
idxs = np.argsort(importances)[::-1]  # sort descending

feature_importance_sel = px.bar(y=importances[idxs], x=X.columns[idxs])
feature_importance_sel.add_vline(x=train_X_sel.shape[1]+0.5, line_dash="dash", line_color="green")
feature_importance_sel.add_vrect(
    x0=-0.5, x1=train_X_sel.shape[1]+0.5, line_width=0, fillcolor="green", opacity=0.2
)
feature_importance_sel.update_layout(dict(
    yaxis_title=r"Importance [%]",
    xaxis_title="Feature",
))

Feature extraction via Principal Component Analysis¶

pca = PCA()
pca.fit(train_X.values)

pca_expl_var = px.line(y=np.cumsum(pca.explained_variance_ratio_), line_shape="hv",
                       color=px.Constant("Cumulative explained variance"))
pca_expl_var.add_bar(y=pca.explained_variance_ratio_, name="Explained variance")
# fill and vertical line at x=13
pca_expl_var.add_vline(x=13, line_dash="dash", line_color="green")
pca_expl_var.add_vrect(x0=-0.5, x1=13, line_width=0, fillcolor="green", opacity=0.2)
# fill and vertical line at x=34
pca_expl_var.add_vline(x=train_X_sel.shape[1]+0.5, line_dash="dash", line_color="red")
pca_expl_var.add_vrect(x0=13, x1=train_X_sel.shape[1]+0.5, line_width=0, fillcolor="red", opacity=0.2)
pca_expl_var.update_layout(dict(
    yaxis_title=r"% of explained variance",
    xaxis_title="Number of principal components",
))

With 13 PCs Cumulative explained variance > 0.9.

extractor_min = PCA(n_components=13)
train_X_ext_min = extractor_min.fit_transform(train_X.values)
test_X_ext_min = extractor_min.fit_transform(test_X.values)
print(f"Extracted {train_X_ext_min.shape[1]} features.")

Extracted 13 features.

What if number of PCs is equal to number of features selected with RFC?

extractor_max = PCA(n_components=train_X_sel.shape[1])
train_X_ext_max = extractor_max.fit_transform(train_X.values)
test_X_ext_max = extractor_max.fit_transform(test_X.values)
print(f"Extracted {train_X_ext_max.shape[1]} features.")

Extracted 37 features.

Model evaluation after feature selection/extraction¶

Training Random Forest Classifier using selected features.

rfc_grid_search.fit(train_X_sel, train_y)
print(rfc_grid_search.best_params_)
rfc_v2 = rfc_grid_search.best_estimator_
evaluate(rfc_v2, test_X_sel, test_y, "rfc_sel")

{'max_depth': 3, 'n_estimators': 50, 'random_state': 1}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        39
           1       0.20      1.00      0.33         1

    accuracy                           0.90        40
   macro avg       0.60      0.95      0.64        40
weighted avg       0.98      0.90      0.93        40

Training Random Forest Classifier using extracted features (less PCs).

rfc_grid_search.fit(train_X_ext_min, train_y)
print(rfc_grid_search.best_params_)
rfc_v3 = rfc_grid_search.best_estimator_
evaluate(rfc_v3, test_X_ext_min, test_y, "rfc_ext_less")

{'max_depth': 3, 'n_estimators': 50, 'random_state': 1}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93        40
           1       0.00      1.00      0.00         0

    accuracy                           0.88        40
   macro avg       0.50      0.94      0.47        40
weighted avg       1.00      0.88      0.93        40

Training Random Forest Classifier using extracted features (more PCs).

rfc_grid_search.fit(train_X_ext_max, train_y)
print(rfc_grid_search.best_params_)
rfc_v4 = rfc_grid_search.best_estimator_
evaluate(rfc_v4, test_X_ext_max, test_y, "rfc_ext_more")

{'max_depth': 5, 'n_estimators': 10, 'random_state': 1}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        38
           1       0.40      1.00      0.57         2

    accuracy                           0.93        40
   macro avg       0.70      0.96      0.77        40
weighted avg       0.97      0.93      0.94        40

Training Logistic Regression Classifier using selected features.

lrc_grid_search.fit(train_X_sel, train_y)
print(lrc_grid_search.best_params_)
lrc_v2 = lrc_grid_search.best_estimator_
evaluate(lrc_v2, test_X_sel, test_y, "lrc_sel")

{'lrc__C': 0.1, 'lrc__max_iter': 1000, 'lrc__penalty': 'l2', 'lrc__random_state': 1, 'lrc__solver': 'lbfgs'}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.60      0.60      0.60         5

    accuracy                           0.90        40
   macro avg       0.77      0.77      0.77        40
weighted avg       0.90      0.90      0.90        40

Training Logistic Regression Classifier using extracted features (less PCs).

lrc_grid_search.fit(train_X_ext_min, train_y)
print(lrc_grid_search.best_params_)
lrc_v3 = lrc_grid_search.best_estimator_
evaluate(lrc_v3, test_X_ext_min, test_y, "lrc_ext_min")

{'lrc__C': 0.01, 'lrc__max_iter': 1000, 'lrc__penalty': 'l2', 'lrc__random_state': 1, 'lrc__solver': 'liblinear'}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93        36
           1       0.40      0.50      0.44         4

    accuracy                           0.88        40
   macro avg       0.67      0.71      0.69        40
weighted avg       0.89      0.88      0.88        40

Training Logistic Regression Classifier using extracted features (more PCs).

lrc_grid_search.fit(train_X_ext_max, train_y)
print(lrc_grid_search.best_params_)
lrc_v4 = lrc_grid_search.best_estimator_
evaluate(lrc_v4, test_X_ext_max, test_y, "lrc_ext_max")

{'lrc__C': 0.1, 'lrc__max_iter': 1000, 'lrc__penalty': 'l2', 'lrc__random_state': 1, 'lrc__solver': 'lbfgs'}

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        35
           1       0.40      0.40      0.40         5

    accuracy                           0.85        40
   macro avg       0.66      0.66      0.66        40
weighted avg       0.85      0.85      0.85        40

Model comparison¶

models = list(acc_scores.keys())
model_scores = list(acc_scores.values())
model_comp = px.bar(x=models, y=model_scores)
model_comp.update_layout(dict(
    yaxis_title="Accuracy",
    xaxis_title="Model",
))

for t, f, fig_title in zip(tprs.values(), fprs.values(), tprs.keys()):
    fig = px.line(x=f, y=t, width=1000, height=1000)
    fig.update_layout(dict(
        title=fig_title,
        xaxis_title="FPR",
        yaxis_title="TPR",
        shapes=[
            dict(
                type='line', line_dash="dash",
                yref='y', y0=0, y1=1,
                xref='x', x0=0, x1= 1
            )
        ]
    ))
    fig.show()

px.bar(y=list(auc_scores.values()), x=list(auc_scores.keys()))
model_comp.update_layout(dict(
    yaxis_title="AUC",
    xaxis_title="Model",
))

	SID	A2M	A4GNT	AAAS	AACS	AAK1	AAMP	AAT	AARS	AATK	...	ZNF91	ZNF93	ZSWIM1	ZYX	StudyID	Gender	Age	Type	PRISM	Died
0	GSM102976	2.243748	1.107010	3.587216	4.182330	6.447120	4.505749	2.092167	4.792785	3.856639	...	6.410246	4.071349	2.679265	9.491001	GSE66099	male	1.6	Discovery	20.0	1
1	GSM102977	1.797216	1.266728	4.418469	5.010960	7.650317	4.419486	1.348086	5.723700	3.482758	...	7.790634	4.550722	3.077656	11.010183	GSE66099	male	1.0	Discovery	26.0	0
2	GSM102978	2.402198	0.946330	3.842511	3.577492	6.975271	3.626157	1.960517	3.973704	3.697106	...	5.770187	5.143331	3.308207	8.247944	GSE66099	male	10.1	Discovery	35.0	1
3	GSM102979	2.038175	1.134434	3.790135	4.081478	5.949536	4.077177	1.670026	5.702786	3.702048	...	7.241728	3.894485	3.011332	10.727581	GSE66099	male	14.8	Discovery	22.0	1
4	GSM102980	2.290560	1.065330	4.904727	4.843061	7.153770	5.816867	1.638090	8.685689	3.370231	...	7.543954	4.992427	2.679478	10.326431	GSE66099	male	0.0	Discovery	20.0	1

	SLC39A8	RHAG	DDIT4	MPO	RRM2	CCL3	MAFF	TYMS	ENPP2	KIAA0101	...	EMR3	HSPA6	FGL2	RGS2	CPVL	CCR2	TGFBI	Gender	Age	PRISM
0	7.837760	4.112866	8.046405	2.324069	4.759059	3.140721	8.578116	5.230842	3.862937	5.590981	...	2.135084	9.884938	11.812192	8.859253	5.722329	7.817754	4.143039	1	1.6	20.0
1	7.751983	1.485795	8.971049	5.258546	7.684430	2.811141	4.715507	7.567845	3.212268	6.279265	...	1.673705	8.728279	9.303093	10.367759	6.251313	8.707475	4.977650	1	1.0	26.0
2	10.279053	1.925620	8.712714	1.856427	5.890788	5.947866	9.109963	6.066830	3.086101	4.688248	...	1.706595	8.115265	11.027659	8.161037	2.841639	2.932313	4.036440	1	10.1	35.0
3	9.601609	5.888388	8.527441	5.750055	8.206514	5.134266	5.156271	8.057016	2.684719	7.520501	...	1.464756	9.028702	9.271985	9.948187	6.095365	8.859346	4.616064	1	14.8	22.0
4	7.785195	4.846312	9.301229	8.695507	10.032503	6.555199	8.041716	9.813448	7.125344	7.953979	...	1.704964	7.833078	10.653802	8.848570	4.970816	5.720992	6.542890	1	0.0	20.0