SIRS morbidity prediction from expression profiles

Available at github.


Libraries and settings

In [1]:
# IMPORTS

# base
from warnings import simplefilter

# external
from plotly.offline import init_notebook_mode
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning, FitFailedWarning
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import plotly.express as px

# local
from utils.utils import *  # import all functions and global variables

# NOTEBOOK SETTINGS
init_notebook_mode(connected=True)
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action="ignore", category=UserWarning)
simplefilter(action="ignore", category=ConvergenceWarning)
simplefilter(action="ignore", category=FitFailedWarning)

Pipeline flowchart


Global variables

In [2]:
data_file = "data/GSE66099.csv"
DEGs_file = "data/GSE66099_DEGs.txt"  # None -> analysis of all genes

label = "Died"
metadata_cols = ["Gender", "Age", "PRISM"]
misc_cols = ["SID", "StudyID", "Type"]

random_seed = 1

Feature description

Feature Description
SID Patient ID
A2M - ZYX Investigated genes
StudyID GEO study accession number
Gender male/female
Age 0.0 - 14.8
PRISM Pediatric RISc of Mortality
Died Did the patient die? 1 - yes, 0 - no

Loading the data

In [3]:
input = pd.read_csv(data_file)
input.head()
Out[3]:
SID A2M A4GNT AAAS AACS AAK1 AAMP AAT AARS AATK ... ZNF91 ZNF93 ZSWIM1 ZYX StudyID Gender Age Type PRISM Died
0 GSM102976 2.243748 1.107010 3.587216 4.182330 6.447120 4.505749 2.092167 4.792785 3.856639 ... 6.410246 4.071349 2.679265 9.491001 GSE66099 male 1.6 Discovery 20.0 1
1 GSM102977 1.797216 1.266728 4.418469 5.010960 7.650317 4.419486 1.348086 5.723700 3.482758 ... 7.790634 4.550722 3.077656 11.010183 GSE66099 male 1.0 Discovery 26.0 0
2 GSM102978 2.402198 0.946330 3.842511 3.577492 6.975271 3.626157 1.960517 3.973704 3.697106 ... 5.770187 5.143331 3.308207 8.247944 GSE66099 male 10.1 Discovery 35.0 1
3 GSM102979 2.038175 1.134434 3.790135 4.081478 5.949536 4.077177 1.670026 5.702786 3.702048 ... 7.241728 3.894485 3.011332 10.727581 GSE66099 male 14.8 Discovery 22.0 1
4 GSM102980 2.290560 1.065330 4.904727 4.843061 7.153770 5.816867 1.638090 8.685689 3.370231 ... 7.543954 4.992427 2.679478 10.326431 GSE66099 male 0.0 Discovery 20.0 1

5 rows × 10603 columns

In [4]:
# gene expression profile
metadata = input[metadata_cols]
expression_profile = input.drop(columns=misc_cols + [label] + metadata_cols)
y = input[label].values.T

# select differentially expressed genes (DEGs)
if DEGs_file:
    DEGs = pd.read_csv(DEGs_file, header=None).values
    DEGs = DEGs.reshape(len(DEGs))
    expression_profile = expression_profile[DEGs]

# concatenate expression profile and metadata
X = pd.concat([expression_profile, metadata], axis=1)

# convert gender to numeric data
X["Gender"] = pd.get_dummies(X["Gender"], drop_first=True).astype(int)
X.head()
Out[4]:
SLC39A8 RHAG DDIT4 MPO RRM2 CCL3 MAFF TYMS ENPP2 KIAA0101 ... EMR3 HSPA6 FGL2 RGS2 CPVL CCR2 TGFBI Gender Age PRISM
0 7.837760 4.112866 8.046405 2.324069 4.759059 3.140721 8.578116 5.230842 3.862937 5.590981 ... 2.135084 9.884938 11.812192 8.859253 5.722329 7.817754 4.143039 1 1.6 20.0
1 7.751983 1.485795 8.971049 5.258546 7.684430 2.811141 4.715507 7.567845 3.212268 6.279265 ... 1.673705 8.728279 9.303093 10.367759 6.251313 8.707475 4.977650 1 1.0 26.0
2 10.279053 1.925620 8.712714 1.856427 5.890788 5.947866 9.109963 6.066830 3.086101 4.688248 ... 1.706595 8.115265 11.027659 8.161037 2.841639 2.932313 4.036440 1 10.1 35.0
3 9.601609 5.888388 8.527441 5.750055 8.206514 5.134266 5.156271 8.057016 2.684719 7.520501 ... 1.464756 9.028702 9.271985 9.948187 6.095365 8.859346 4.616064 1 14.8 22.0
4 7.785195 4.846312 9.301229 8.695507 10.032503 6.555199 8.041716 9.813448 7.125344 7.953979 ... 1.704964 7.833078 10.653802 8.848570 4.970816 5.720992 6.542890 1 0.0 20.0

5 rows × 111 columns

Visualisation of metadata

In [5]:
meta_df = input[[label] + metadata_cols]
for col in meta_df:
    value_dict = dict(meta_df[col].value_counts())
    categories = [col for _ in value_dict]
    metadata_vcs = [list(value_dict.values()), list(value_dict.keys()), categories]
    metadata_vcs = np.array(metadata_vcs).T
    metadata_vcs = pd.DataFrame(metadata_vcs, columns=["Count", "Label", "Category"])
    metadata_bar = px.histogram(metadata_vcs, y="Category", color="Label", x="Count", orientation="h")
    metadata_bar.update_layout(dict(
        title=f"{col} proportions",
        yaxis_title="",
        xaxis_title="Counts",
        yaxis_showticklabels=False,
        xaxis_showticklabels=False,
        legend_title=col,
        width=1000,
        height=300,
    ))
    metadata_bar.show()

Visualisation of DEGs' expression profiles

In [6]:
# only plot expression profile when using DEGs_file
# otherwise the plot is very large and demanding for GPU to render
if DEGs_file:
    ep_heatmap_all = px.imshow(expression_profile.values, x=DEGs, width=2000, height=500)
    ep_heatmap_all.update_layout(dict(
        title="Expression profiles of all patients",
        yaxis_title="",
        yaxis_showticklabels=False,
        xaxis_title="Gene",
        coloraxis_colorbar=dict(title="Expression level")
    ))
    ep_heatmap_all.show()
In [7]:
if DEGs_file:
    ep_heatmap1 = px.imshow(expression_profile.values[np.where(y == 1)],
                               x=DEGs, width=2000, height=500)
    ep_heatmap1.update_layout(dict(
        title="Expression profiles of patients who died",
        yaxis_title="",
        yaxis_showticklabels=False,
        xaxis_title="Gene",
        coloraxis_colorbar=dict(title="Expression level")
    ))
    ep_heatmap1.show()
In [8]:
if DEGs_file:
    ep_heatmap0 = px.imshow(expression_profile.values[np.where(y == 0)],
                               x=DEGs, width=2000, height=500)
    ep_heatmap0.update_layout(dict(
        title="Expression profiles of patients who did not die",
        yaxis_title="",
        yaxis_showticklabels=False,
        xaxis_title="Gene",
        coloraxis_colorbar=dict(title="Expression level")
    ))
    ep_heatmap0.show()

Finding the best model and data transformations

Splitting the dataset

Due to the small number of observations (199) only 20% is selected as a test set.

In [9]:
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=random_seed
)

Model initialization and parameter grids

Hyperparameter tuning was done automatically using grid_search() method.

Random Forest Classifier

In [10]:
rfc = RandomForestClassifier()
rfc.get_params()
Out[10]:
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
In [11]:
rfc_param_grid = dict(
    random_state=[random_seed],
    n_estimators=[10, 50, 100, 250, 500],
    max_depth=range(2, 6),
)

rfc_grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=rfc_param_grid
)

Logistic Regression

In [12]:
lrc = Pipeline([
    ("std", StandardScaler()),
    ("lrc", LogisticRegression())
])
lrc.get_params()
Out[12]:
{'memory': None,
 'steps': [('std', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('lrc',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                      warm_start=False))],
 'verbose': False,
 'std': StandardScaler(copy=True, with_mean=True, with_std=True),
 'lrc': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 'std__copy': True,
 'std__with_mean': True,
 'std__with_std': True,
 'lrc__C': 1.0,
 'lrc__class_weight': None,
 'lrc__dual': False,
 'lrc__fit_intercept': True,
 'lrc__intercept_scaling': 1,
 'lrc__l1_ratio': None,
 'lrc__max_iter': 100,
 'lrc__multi_class': 'auto',
 'lrc__n_jobs': None,
 'lrc__penalty': 'l2',
 'lrc__random_state': None,
 'lrc__solver': 'lbfgs',
 'lrc__tol': 0.0001,
 'lrc__verbose': 0,
 'lrc__warm_start': False}
In [13]:
lrc_param_grid = dict(
    lrc__random_state=[random_seed],
    lrc__C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    lrc__penalty=["l2", "none"],
    lrc__solver=["lbfgs", "liblinear", "newton-cg", "sag", "saga"],
    lrc__max_iter=[1000]
)

lrc_grid_search = GridSearchCV(
    estimator=lrc,
    param_grid=lrc_param_grid
)

Model evaluation before feature selection/extraction

Training Random Forest Classifier on all DEGs.

In [14]:
rfc_grid_search.fit(train_X, train_y)
print(rfc_grid_search.best_params_)
rfc_v1 = rfc_grid_search.best_estimator_
evaluate(rfc_v1, test_X, test_y, "rfc_all")
{'max_depth': 4, 'n_estimators': 50, 'random_state': 1}
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        32
           1       0.60      0.38      0.46         8

    accuracy                           0.82        40
   macro avg       0.73      0.66      0.68        40
weighted avg       0.81      0.82      0.81        40

Training Logistic Regression Classifier on all DEGs.

In [15]:
lrc_grid_search.fit(train_X, train_y)
print(lrc_grid_search.best_params_)
lrc_v1 = lrc_grid_search.best_estimator_
evaluate(lrc_v1, test_X, test_y, "lrc_all")
{'lrc__C': 0.01, 'lrc__max_iter': 1000, 'lrc__penalty': 'l2', 'lrc__random_state': 1, 'lrc__solver': 'lbfgs'}
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.60      0.60      0.60         5

    accuracy                           0.90        40
   macro avg       0.77      0.77      0.77        40
weighted avg       0.90      0.90      0.90        40

Feature selection via Random Forest Classifier

Training again on the whole dataset to not decrease selector performance.

In [16]:
rfc_grid_search.fit(X, y)
print(rfc_grid_search.best_params_)
rfc_selector = rfc_grid_search.best_estimator_
{'max_depth': 5, 'n_estimators': 250, 'random_state': 1}

Visualising feature importance.

In [17]:
importances = rfc_selector.feature_importances_
idxs = np.argsort(importances)[::-1]  # sort descending

feature_importance = px.bar(y=importances[idxs], x=X.columns[idxs])
feature_importance.update_layout(dict(
    yaxis_title=r"Importance [%]",
    xaxis_title="Feature",
))

Selecting features.

In [18]:
selector = SelectFromModel(
    rfc_selector,
    threshold=0.01,
    prefit=True
)
train_X_sel = selector.transform(train_X.values)
test_X_sel = selector.transform(test_X.values)
print(f"Selected {train_X_sel.shape[1]} features.")
Selected 37 features.
In [19]:
importances = rfc_selector.feature_importances_
idxs = np.argsort(importances)[::-1]  # sort descending

feature_importance_sel = px.bar(y=importances[idxs], x=X.columns[idxs])
feature_importance_sel.add_vline(x=train_X_sel.shape[1]+0.5, line_dash="dash", line_color="green")
feature_importance_sel.add_vrect(
    x0=-0.5, x1=train_X_sel.shape[1]+0.5, line_width=0, fillcolor="green", opacity=0.2
)
feature_importance_sel.update_layout(dict(
    yaxis_title=r"Importance [%]",
    xaxis_title="Feature",
))

Feature extraction via Principal Component Analysis

In [20]:
pca = PCA()
pca.fit(train_X.values)

pca_expl_var = px.line(y=np.cumsum(pca.explained_variance_ratio_), line_shape="hv",
                       color=px.Constant("Cumulative explained variance"))
pca_expl_var.add_bar(y=pca.explained_variance_ratio_, name="Explained variance")
# fill and vertical line at x=13
pca_expl_var.add_vline(x=13, line_dash="dash", line_color="green")
pca_expl_var.add_vrect(x0=-0.5, x1=13, line_width=0, fillcolor="green", opacity=0.2)
# fill and vertical line at x=34
pca_expl_var.add_vline(x=train_X_sel.shape[1]+0.5, line_dash="dash", line_color="red")
pca_expl_var.add_vrect(x0=13, x1=train_X_sel.shape[1]+0.5, line_width=0, fillcolor="red", opacity=0.2)
pca_expl_var.update_layout(dict(
    yaxis_title=r"% of explained variance",
    xaxis_title="Number of principal components",
))

With 13 PCs Cumulative explained variance > 0.9.

In [21]:
extractor_min = PCA(n_components=13)
train_X_ext_min = extractor_min.fit_transform(train_X.values)
test_X_ext_min = extractor_min.fit_transform(test_X.values)
print(f"Extracted {train_X_ext_min.shape[1]} features.")
Extracted 13 features.

What if number of PCs is equal to number of features selected with RFC?

In [22]:
extractor_max = PCA(n_components=train_X_sel.shape[1])
train_X_ext_max = extractor_max.fit_transform(train_X.values)
test_X_ext_max = extractor_max.fit_transform(test_X.values)
print(f"Extracted {train_X_ext_max.shape[1]} features.")
Extracted 37 features.

Model evaluation after feature selection/extraction

Training Random Forest Classifier using selected features.

In [23]:
rfc_grid_search.fit(train_X_sel, train_y)
print(rfc_grid_search.best_params_)
rfc_v2 = rfc_grid_search.best_estimator_
evaluate(rfc_v2, test_X_sel, test_y, "rfc_sel")
{'max_depth': 3, 'n_estimators': 50, 'random_state': 1}
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        39
           1       0.20      1.00      0.33         1

    accuracy                           0.90        40
   macro avg       0.60      0.95      0.64        40
weighted avg       0.98      0.90      0.93        40

Training Random Forest Classifier using extracted features (less PCs).

In [24]:
rfc_grid_search.fit(train_X_ext_min, train_y)
print(rfc_grid_search.best_params_)
rfc_v3 = rfc_grid_search.best_estimator_
evaluate(rfc_v3, test_X_ext_min, test_y, "rfc_ext_less")
{'max_depth': 3, 'n_estimators': 50, 'random_state': 1}
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93        40
           1       0.00      1.00      0.00         0

    accuracy                           0.88        40
   macro avg       0.50      0.94      0.47        40
weighted avg       1.00      0.88      0.93        40

Training Random Forest Classifier using extracted features (more PCs).

In [25]:
rfc_grid_search.fit(train_X_ext_max, train_y)
print(rfc_grid_search.best_params_)
rfc_v4 = rfc_grid_search.best_estimator_
evaluate(rfc_v4, test_X_ext_max, test_y, "rfc_ext_more")
{'max_depth': 5, 'n_estimators': 10, 'random_state': 1}
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        38
           1       0.40      1.00      0.57         2

    accuracy                           0.93        40
   macro avg       0.70      0.96      0.77        40
weighted avg       0.97      0.93      0.94        40

Training Logistic Regression Classifier using selected features.

In [26]:
lrc_grid_search.fit(train_X_sel, train_y)
print(lrc_grid_search.best_params_)
lrc_v2 = lrc_grid_search.best_estimator_
evaluate(lrc_v2, test_X_sel, test_y, "lrc_sel")
{'lrc__C': 0.1, 'lrc__max_iter': 1000, 'lrc__penalty': 'l2', 'lrc__random_state': 1, 'lrc__solver': 'lbfgs'}
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.60      0.60      0.60         5

    accuracy                           0.90        40
   macro avg       0.77      0.77      0.77        40
weighted avg       0.90      0.90      0.90        40

Training Logistic Regression Classifier using extracted features (less PCs).

In [27]:
lrc_grid_search.fit(train_X_ext_min, train_y)
print(lrc_grid_search.best_params_)
lrc_v3 = lrc_grid_search.best_estimator_
evaluate(lrc_v3, test_X_ext_min, test_y, "lrc_ext_min")
{'lrc__C': 0.01, 'lrc__max_iter': 1000, 'lrc__penalty': 'l2', 'lrc__random_state': 1, 'lrc__solver': 'liblinear'}
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93        36
           1       0.40      0.50      0.44         4

    accuracy                           0.88        40
   macro avg       0.67      0.71      0.69        40
weighted avg       0.89      0.88      0.88        40

Training Logistic Regression Classifier using extracted features (more PCs).

In [28]:
lrc_grid_search.fit(train_X_ext_max, train_y)
print(lrc_grid_search.best_params_)
lrc_v4 = lrc_grid_search.best_estimator_
evaluate(lrc_v4, test_X_ext_max, test_y, "lrc_ext_max")
{'lrc__C': 0.1, 'lrc__max_iter': 1000, 'lrc__penalty': 'l2', 'lrc__random_state': 1, 'lrc__solver': 'lbfgs'}
CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        35
           1       0.40      0.40      0.40         5

    accuracy                           0.85        40
   macro avg       0.66      0.66      0.66        40
weighted avg       0.85      0.85      0.85        40

Model comparison

In [29]:
models = list(acc_scores.keys())
model_scores = list(acc_scores.values())
model_comp = px.bar(x=models, y=model_scores)
model_comp.update_layout(dict(
    yaxis_title="Accuracy",
    xaxis_title="Model",
))
In [30]:
for t, f, fig_title in zip(tprs.values(), fprs.values(), tprs.keys()):
    fig = px.line(x=f, y=t, width=1000, height=1000)
    fig.update_layout(dict(
        title=fig_title,
        xaxis_title="FPR",
        yaxis_title="TPR",
        shapes=[
            dict(
                type='line', line_dash="dash",
                yref='y', y0=0, y1=1,
                xref='x', x0=0, x1= 1
            )
        ]
    ))
    fig.show()
In [31]:
px.bar(y=list(auc_scores.values()), x=list(auc_scores.keys()))
model_comp.update_layout(dict(
    yaxis_title="AUC",
    xaxis_title="Model",
))