commit
bb4d0014b6
1 changed files with 306 additions and 0 deletions
@ -0,0 +1,306 @@
@@ -0,0 +1,306 @@
|
||||
# -*- coding: utf-8 -*- |
||||
""" |
||||
Created on Wed May 19 09:23:49 2021 |
||||
|
||||
@author: Dijkhofmf |
||||
""" |
||||
|
||||
# Import stuff |
||||
|
||||
import os |
||||
import pandas as pd |
||||
import numpy as np |
||||
import matplotlib.pyplot as plt |
||||
import seaborn as sns |
||||
from scipy import stats |
||||
#from pca import pca |
||||
from sklearn import preprocessing |
||||
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut |
||||
from sklearn.cross_decomposition import PLSRegression |
||||
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score |
||||
|
||||
from sklearn.metrics import r2_score, accuracy_score, plot_confusion_matrix, confusion_matrix, roc_curve, auc, roc_auc_score, precision_recall_fscore_support, classification_report, mean_squared_error |
||||
|
||||
#%% Import data and path |
||||
|
||||
Path = 'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles' |
||||
|
||||
# Set path |
||||
os.chdir(Path) |
||||
|
||||
#%% Create DF |
||||
FinalDF = pd.DataFrame(pd.read_csv('FinalDataset.csv')) |
||||
|
||||
X = FinalDF.drop(['Pt Type'], axis=1) |
||||
X = X.drop(['Study ID'], axis=1) |
||||
|
||||
# #%% |
||||
|
||||
# plt.figure(dpi=720, figsize=(20,20)) |
||||
# heatmap = sns.heatmap(X.corr(), vmin=-1, vmax=1, annot=True) |
||||
# plt.title('Pearson correlation heatmap') |
||||
|
||||
# plt.figure(dpi=720, figsize=(20,20)) |
||||
# heatmap = sns.heatmap(X.corr(method='spearman'), vmin=-1, vmax=1, annot=True) |
||||
# plt.title('Spearman correlation heatmap') |
||||
|
||||
|
||||
#%% |
||||
cols = pd.DataFrame(X).columns |
||||
y = FinalDF['Pt Type'] |
||||
|
||||
y = y.replace('Complication', int(1)) |
||||
y = y.replace('Healthy', int(0)) |
||||
|
||||
X = X.dropna() |
||||
|
||||
y = y.loc[X.index] |
||||
y = y.astype(int) |
||||
|
||||
#X = X.reset_index() |
||||
#y = y.reset_index() |
||||
#y = y.drop('index', axis=1) |
||||
#X = X.drop('Study ID', axis=1) |
||||
|
||||
X = preprocessing.scale(X) |
||||
|
||||
#%% |
||||
|
||||
#PLS = PLSRegression(n_components=2, scale=True) |
||||
|
||||
# # Or reduce the data towards 2 PCs |
||||
# model = pca(n_components=8) |
||||
# # Fit transform |
||||
# results = model.fit_transform(X, y, col_labels=cols) |
||||
# # Plot explained variance |
||||
# fig, ax = model.plot() |
||||
# # Scatter first 2 PCs |
||||
# fig, ax = model.scatter() |
||||
# # Make biplot with the number of features |
||||
# fig, ax = model.biplot(n_feat=8) |
||||
|
||||
#%% |
||||
|
||||
def base_pls(X_train, y_train, X_test, n_components): |
||||
|
||||
# Define PLS model and fit it to the train set |
||||
PLS = PLSRegression(n_components=n_components) |
||||
PLS.fit(X_train, y_train) |
||||
|
||||
# Prediction on train set |
||||
y_c = PLS.predict(X_train) |
||||
|
||||
# Prediction on test set |
||||
y_p = PLS.predict(X_test) |
||||
|
||||
return(y_c, y_p) |
||||
|
||||
#%% |
||||
|
||||
cval = LeaveOneOut() |
||||
n_components = range(1,19) |
||||
|
||||
X = pd.DataFrame(X) |
||||
yu = pd.DataFrame(y) |
||||
|
||||
rmse_c = np.zeros(len(n_components)).astype('float32') |
||||
rmse_cv = np.zeros(len(n_components)).astype('float32') |
||||
R2_cv = np.zeros(len(n_components)).astype('float32') |
||||
Q2_cv = np.zeros(len(n_components)).astype('float32') |
||||
PRESScv = np.zeros(len(n_components)).astype('float32') |
||||
RSS_cv = np.zeros(len(n_components)).astype('float32') |
||||
WoldR = np.zeros(len(n_components)).astype('float32') |
||||
|
||||
for i in n_components: |
||||
|
||||
rmsec = [] |
||||
rmsecv = [] |
||||
PRESS = [] |
||||
RSS = [] |
||||
R2 = [] |
||||
Q2 = [] |
||||
|
||||
for train, test in cval.split(X): |
||||
|
||||
PLS = PLSRegression(n_components=i, scale=False) |
||||
PLS.fit_transform(X.iloc[train], y.iloc[train]) |
||||
|
||||
Y_pred_train = PLS.predict(X.iloc[train]) |
||||
Y_pred_test = PLS.predict(X.iloc[test]) |
||||
|
||||
rmsec.append(np.sqrt(mean_squared_error(y.iloc[train], Y_pred_train[:,0]))) |
||||
rmsecv.append(np.sqrt(mean_squared_error(y.iloc[test], Y_pred_test[:,0]))) |
||||
|
||||
y_value = y.iloc[test] |
||||
y_value = y_value.iloc[0] |
||||
|
||||
RSS.append((Y_pred_test - y_value)**2) |
||||
PRESS.append(mean_squared_error(y.iloc[test], Y_pred_test[:,0])) |
||||
R2Y = r2_score(y.iloc[train], Y_pred_train) |
||||
|
||||
|
||||
rmse_c[i-1] = np.mean(np.array(rmsec)) |
||||
rmse_cv[i-1] = np.mean(np.array(rmsecv)) |
||||
RSS_cv[i-1] = np.mean(np.array(RSS)) |
||||
PRESScv[i-1] = np.mean(np.array(PRESS)) |
||||
R2_cv[i-1] = np.mean(np.array(R2Y)) |
||||
if i == 1: |
||||
Q2_cv[0] = 0 |
||||
else: |
||||
Q2_cv[i-1] = (1 - (PRESScv[i-1]/RSS_cv[i-2])) |
||||
|
||||
|
||||
if i == 1: |
||||
WoldR[i-1] = 0 |
||||
elif i > 1: |
||||
WoldR[i-1] = (PRESScv[i-1]/PRESScv[i-2]) |
||||
|
||||
|
||||
|
||||
with plt.style.context(('seaborn')): |
||||
plt.plot(rmse_c, 'o-r', label = "RMSE Calibration") |
||||
plt.plot(rmse_cv, 'o-b', label = "RMSE Cross Validation") |
||||
plt.ylabel('RMSE') |
||||
plt.xlabel('Number of LV included') |
||||
plt.legend() |
||||
plt.show() |
||||
|
||||
with plt.style.context(('seaborn')): |
||||
plt.plot(R2_cv, 'o-r', label = "R2_score") |
||||
plt.plot(Q2_cv, 'o-b', label = "Q2_score") |
||||
plt.ylabel('Score') |
||||
plt.xlabel('Number of LV included') |
||||
plt.legend() |
||||
plt.show() |
||||
|
||||
with plt.style.context(('seaborn')): |
||||
plt.plot(WoldR, 'o-r', label = "Wold R") |
||||
plt.plot(PRESScv, 'o-b', label = "PRESS") |
||||
plt.ylabel('Score') |
||||
plt.xlabel('Number of LV included') |
||||
plt.legend() |
||||
plt.show() |
||||
|
||||
#%% Select n_components based on LOOCV: |
||||
|
||||
PLS = PLSRegression(n_components=6, scale=True) |
||||
|
||||
|
||||
PLS.fit_transform(X, y) |
||||
|
||||
Y_pred = PLS.predict(X) |
||||
bin_pred = (PLS.predict(X)[:,0] > 0.5).astype('uint8') |
||||
|
||||
|
||||
plt.figure(dpi=720) |
||||
plt.title('Confusion Matrix PLS-DA') |
||||
cf_rf = confusion_matrix(y, bin_pred) |
||||
sns.heatmap(cf_rf, annot=True, cmap='Blues') |
||||
plt.xlabel('Predicted outcomes') |
||||
plt.ylabel('True outcomes') |
||||
plt.show() |
||||
|
||||
print(classification_report(y, bin_pred)) |
||||
scores_PLS = precision_recall_fscore_support(y, bin_pred) |
||||
|
||||
#%% |
||||
|
||||
def vip(model): |
||||
|
||||
t = model.x_scores_ |
||||
w = model.x_weights_ |
||||
q = model.y_loadings_ |
||||
p, h = w.shape |
||||
|
||||
vips = np.zeros((p,)) |
||||
|
||||
s = np.diag(t.T @ t @ q.T @ q).reshape(h, -1) |
||||
total_s = np.sum(s) |
||||
|
||||
for i in range(p): |
||||
|
||||
weight = np.array([ (w[i,j] / np.linalg.norm(w[:,j]))**2 for j in range(h) ]) |
||||
vips[i] = np.sqrt(p*(s.T @ weight)/total_s) |
||||
|
||||
return vips |
||||
|
||||
VIP = vip(PLS) |
||||
|
||||
#%% |
||||
|
||||
cols = ['Age (years)', 'Gender', 'Daily alcohol use', 'Medication', |
||||
'ASA-classification', 'Recurrent disease?', 'Comorb', |
||||
'Independent, with others', 'Smokes cigarettes/sigar', 'BMI', 'GFI', |
||||
'HADS_A', 'HADS Depression', 'ADL', 'iADL', 'TUG', 'Handgrip strength', |
||||
'Avg. Steps/day', 'Avg. MVPA/day'] |
||||
|
||||
VIP = pd.DataFrame(VIP, columns=['VIP']) |
||||
VIP['labels'] = cols |
||||
VIP = VIP.sort_values('VIP', ascending=False) |
||||
|
||||
|
||||
#%% |
||||
|
||||
BCT = VIP['VIP'].describe()[6] + (1.5*( VIP['VIP'].describe()[6]- VIP['VIP'].describe()[4])) |
||||
|
||||
#%% |
||||
|
||||
VIP = VIP[VIP['VIP']>1] |
||||
|
||||
#%% |
||||
import matplotlib.pylab as pylab |
||||
|
||||
params = {'legend.fontsize': 'x-large', |
||||
'axes.labelsize': 'x-large', |
||||
'axes.titlesize':'x-large', |
||||
'xtick.labelsize':'x-large', |
||||
'ytick.labelsize':'x-large'} |
||||
|
||||
pylab.rcParams.update(params) |
||||
|
||||
|
||||
plt.figure(figsize=(12,7), dpi=720) |
||||
sns.set_style('whitegrid') |
||||
plt.bar(x=VIP['labels'], height=VIP['VIP']) |
||||
plt.title('VIP scores') |
||||
plt.yticks(fontsize='x-large') |
||||
plt.xticks(rotation=-90, fontsize='x-large') |
||||
|
||||
#%% Compute ROC curve and ROC area for each class |
||||
|
||||
fpr = dict() |
||||
tpr = dict() |
||||
roc_auc = dict() |
||||
|
||||
fpr, tpr, _ = roc_curve(y, bin_pred) |
||||
roc_auc = auc(fpr, tpr) |
||||
|
||||
# Compute micro-average ROC curve and ROC area |
||||
fpr_m, tpr_m, _ = roc_curve(y.ravel(), bin_pred.ravel()) |
||||
roc_auc_m = auc(fpr_m, tpr_m) |
||||
|
||||
plt.figure(dpi=720) |
||||
lw = 2 |
||||
plt.plot(fpr, tpr, color='darkorange', |
||||
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) |
||||
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') |
||||
plt.xlim([0.0, 1.0]) |
||||
plt.ylim([0.0, 1.05]) |
||||
plt.xlabel('False Positive Rate') |
||||
plt.ylabel('True Positive Rate') |
||||
plt.title('ROC-Curve PLS-DA') |
||||
plt.legend(loc="lower right") |
||||
plt.show() |
||||
|
||||
plt.figure(dpi=720) |
||||
lw = 2 |
||||
plt.plot(fpr_m, tpr_m, color='darkorange', |
||||
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc_m) |
||||
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') |
||||
plt.xlim([0.0, 1.0]) |
||||
plt.ylim([0.0, 1.05]) |
||||
plt.xlabel('False Positive Rate') |
||||
plt.ylabel('True Positive Rate') |
||||
plt.title('Receiver operating characteristic example') |
||||
plt.legend(loc="lower right") |
||||
plt.show() |
Loading…
Reference in new issue