Scripts to create a dataset from REDCap outputs for use in a PLS-DA classification.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

136 lines
5.0 KiB

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 8 10:38:31 2021
@author: Dijkhofmf
"""
# Import stuff
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
# Turn off pandas' chained-assignment warning (the pandas default is 'warn').
pd.options.mode.chained_assignment = None
#%% Define filenames and path
FilenameComplete = 'Complete.csv'
FilenameDemo = 'DemoData.csv'
Filename_T0 = 'FinalDF_T0.csv'
# Raw string literal: the Windows path contains backslashes followed by
# letters that are not valid escape sequences, which a normal string literal
# only tolerates with a warning. The resulting path text is unchanged.
Path = r'I:\Mike Dijkhof\Connecare MGP\Data\FinalFiles'
# Make the data folder the working directory so the bare filenames resolve.
os.chdir(Path)
# Read the completion overview and the demographic export.
DFComplete = pd.read_csv(FilenameComplete)
DFDemo = pd.read_csv(FilenameDemo)

# Copy the completion answer next to the demographics (matched on the row
# index) and discard every patient whose answer is anything other than 'Yes'.
DFDemo['Complete data'] = DFComplete['Has patient completed study?']
incomplete_rows = DFDemo.index[DFDemo['Complete data'] != 'Yes']
DFDemo = DFDemo.drop(index=incomplete_rows)

# Strip the 'ASA ' text prefix so the classification can be stored as a float.
asa_without_prefix = DFDemo['ASA-classification'].str.replace('ASA ', '')
DFDemo['ASA-classification'] = asa_without_prefix.astype('float64')

# Checkbox answers become 0/1 throughout the frame.
DFDemo = DFDemo.replace('Unchecked', 0).replace('Checked', 1)
# Columns that are removed from the demographic frame before further processing.
Dropcols = [
    'Year of birth',
    'Subject ID Connecare',
    'Subject ID Connecare (version 2.0)',
    'Date subject signed consent',
    'Nationality',
    'Language',
    'Former occupation',
    'Does the patient have a smartphone that they use?',
    'How many days a week is the smartphone used?',
    'Does the patient have a tablet that they use?',
    'How many days a week is the tablet used?',
    'Does the patient have a computer/pc that they use?',
    'How many days a week is the computer/pc used?',
    'Smart device at home',
    'Smart device at inclusion? (check all that apply) (choice=Fitbit)',
    'Smart device at inclusion? (check all that apply) (choice=Weight scale)',
    'Indication Surgery',
    'Comments',
    'Complete?',
    'Complete data',
]
# Remove those columns and index the remaining data by study ID.
DFDemo = DFDemo.drop(columns=Dropcols).set_index('Study ID')
# Calculate CCI score
# The comorbidity indicators are taken by position: columns 10-28 (19 columns).
# NOTE(review): assumes these columns are ordered by Charlson weight, i.e.
# 10 items of weight 1, 6 of weight 2, 1 of weight 3 and 2 of weight 6 --
# confirm against the column order of the REDCap export.
# The previous code scaled columns 26:28 by 6 after column 26 had already been
# scaled by 3, so column 26 ended up with weight 18 and column 28 with weight 1.
ColMask = DFDemo.columns[10:29]
cci_weights = [1] * 10 + [2] * 6 + [3] + [6] * 2
# Weight each indicator column, then add the weighted indicators per patient.
# mul() raises if the number of selected columns does not match the weights.
DFDemo['Comorb'] = DFDemo[ColMask].mul(cci_weights, axis=1).sum(axis=1)
# The individual indicators are no longer needed once the score exists.
DFDemo = DFDemo.drop(ColMask, axis=1)
#%%
# Load the T0 table, index it by study ID, and copy its patient type onto the
# demographic frame (values are matched on the 'Study ID' index).
DF_T0 = pd.read_csv(Filename_T0).set_index('Study ID')
patient_type = DF_T0['Pt Type']
DFDemo['Type'] = patient_type
#%% code variables
# Two-level text answers are mapped onto 0/1.
DFDemo['Gender'] = DFDemo['Gender'].replace('Female', 0).replace('Male', 1)
DFDemo['Recurrent disease?'] = (
    DFDemo['Recurrent disease?'].replace('No', 0).replace('Yes', 1)
)

# One set of dummy columns per categorical variable; the first level of each
# variable is dropped.
# NOTE(review): the dummies carry no prefix, so equal category labels in two
# source columns would produce duplicate column names -- verify on the data.
Housing, Education, Smoking, Med_Dif, Loc_Tu, Prim_Mal = (
    pd.get_dummies(DFDemo[source_col], drop_first=True)
    for source_col in (
        'Housing',
        'Education',
        'Smoking',
        'Difficulty preparing medication?',
        'Location tumour',
        'Primary Malignancy',
    )
)

# Remove the original text columns; 'Marital State' and 'Tumour Stage' are
# removed without being encoded.
DFDemo = DFDemo.drop(
    columns=[
        'Marital State',
        'Housing',
        'Education',
        'Tumour Stage',
        'Smoking',
        'Difficulty preparing medication?',
        'Location tumour',
        'Primary Malignancy',
    ]
)
#%%
# Append the dummy columns to the demographic frame.
DFDemo = pd.concat(
    [DFDemo, Housing, Education, Smoking, Med_Dif, Loc_Tu, Prim_Mal],
    axis=1,
)
#%% Create Neoadjuvant therapy variable
# Collapse the neo-adjuvant therapy checkboxes into one code per patient:
#   1 = chemotherapy and radiotherapy, 2 = chemotherapy, 3 = immunotherapy,
#   4 = radiotherapy, 5 = targeted therapy, 0 = none, NaN = no box matched.
chemo = DFDemo['Neo-adjuvant therapy (choice=Chemotherapy)'] == 1
radio = DFDemo['Neo-adjuvant therapy (choice=Radiotherapy)'] == 1
immuno = DFDemo['Neo-adjuvant therapy (choice=Immunotherapy)'] == 1
targeted = DFDemo['Neo-adjuvant therapy (choice=Targeted Therapy)'] == 1
no_therapy = DFDemo['Neo-adjuvant therapy (choice=None)'] == 1

# Start from NaN so the column always exists, even when no row matches.
DFDemo['Neo'] = float('nan')
# Assign from lowest to highest priority: a later assignment overwrites an
# earlier one, which reproduces the precedence of the former if/elif chain.
for neo_code, neo_mask in [
    (0, no_therapy),
    (5, targeted),
    (4, radio),
    (3, immuno),
    (2, chemo),
    (1, chemo & radio),
]:
    DFDemo.loc[neo_mask, 'Neo'] = neo_code

# Dummy-code the combined variable (first level dropped) and replace the
# checkbox columns and the intermediate 'Neo' column by the dummies.
Neo = pd.get_dummies(DFDemo['Neo'], drop_first=True)
NeoDrop = [
    'Neo-adjuvant therapy (choice=Chemotherapy)',
    'Neo-adjuvant therapy (choice=Immunotherapy)',
    'Neo-adjuvant therapy (choice=Radiotherapy)',
    'Neo-adjuvant therapy (choice=None)',
    'Neo-adjuvant therapy (choice=Targeted Therapy)',
    'Neo',
]
DFDemo = DFDemo.drop(NeoDrop, axis=1)
DFDemo = pd.concat([DFDemo, Neo], axis=1)
#%%
# Plot the age distribution. displot is a figure-level function that opens its
# own figure, so no separate plt.figure() call is made (that call only left an
# additional empty figure behind).
sns.displot(DFDemo['Age (years)'])
#%%
from scipy import stats

# Split the patients on 'Type': 'Healthy' versus everything else (rows without
# a type end up in the second group). The grouping column itself is removed so
# only the variables to be tested remain.
DemoComp = DFDemo[DFDemo['Type'] != 'Healthy']
DemoComp = DemoComp.drop('Type', axis=1)
DemoNoComp = DFDemo[DFDemo['Type'] == 'Healthy']
DemoNoComp = DemoNoComp.drop('Type', axis=1)

# Independent-samples t-test for all columns at once, ignoring missing values;
# OutcomeT holds the p-values as a list.
outcomeT = stats.ttest_ind(DemoNoComp, DemoComp, nan_policy='omit')
OutcomeT = outcomeT[1].tolist()

# Mann-Whitney U test per column; OutcomeMW collects the p-values.
OutcomeMW = []
for column in DemoComp:
    print(column)
    # Drop missing values so this test uses the same data as the t-test above.
    healthy_values = DemoNoComp[column].dropna()
    other_values = DemoComp[column].dropna()
    if healthy_values.empty or other_values.empty:
        # Nothing to compare for this variable; keep the list aligned.
        OutcomeMW.append(float('nan'))
        continue
    # State the alternative explicitly: the default differs between SciPy
    # versions, and the t-test above is two-sided as well.
    outcomeMW = stats.mannwhitneyu(healthy_values, other_values,
                                   alternative='two-sided')
    OutcomeMW.append(outcomeMW[1])
#DFDemo.to_csv('FinalDemo.csv')