forked from DigitalBiomarkerDiscoveryPipeline/CovIdentify
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ita_model_development.py
260 lines (180 loc) · 11.7 KB
/
ita_model_development.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ITA Model Development
@author: mobashirshandhi
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
### function to get summary statistics of deviation metrics from time series wearable data baseline to detection period
def get_delta_summary_stat(wearable_daily_df, baseline_start_date,baseline_end_date,detection_start_date,detection_end_date):
### Example baseline and detection period start and end date
### baseline_start_date=-60
### baseline_end_date=-22
### detection_start_date=-5
### detection_end_date=-1
## get separate dataframe for baseline and detection period
wearable_daily_df_bl=wearable_daily_df.loc[(wearable_daily_df['days_from_test'] >= baseline_start_date) & (wearable_daily_df['days_from_test'] <= baseline_end_date)]
wearable_daily_df_detection=wearable_daily_df.loc[(wearable_daily_df['days_from_test'] >= detection_start_date) & (wearable_daily_df['days_from_test'] <= detection_end_date)]
wearable_daily_df_detection.reset_index(inplace=True,drop=True)
wearable_daily_df_bl.reset_index(inplace=True,drop=True)
## get subjectwise summary (median, mean, std, iqr) for baseline period
wearable_daily_df_bl_subject_wise_summary=wearable_daily_df_bl.groupby(['participant_id'],as_index=True).agg(
baseline_median = ('Value', np.median),
baseline_mean = ('Value', np.mean),
baseline_std = ('Value', np.std),
baseline_iqr = ('Value', stats.iqr)
)
# get deviation metrics (deviation from baseline) for each day in the detection period
## adding bl summary value with the detection df
wearable_daily_df_detection = wearable_daily_df_detection.join(wearable_daily_df_bl_subject_wise_summary, on="participant_id")
## calculating deviation metrics
wearable_daily_df_detection['Delta']=wearable_daily_df_detection['Value']-wearable_daily_df_detection['baseline_median']
wearable_daily_df_detection['Delta_Normalized']=wearable_daily_df_detection['Delta']/wearable_daily_df_detection['baseline_median']
wearable_daily_df_detection['Delta_standardized']=wearable_daily_df_detection['Delta']/wearable_daily_df_detection['baseline_iqr']
wearable_daily_df_detection['Z_score']=(wearable_daily_df_detection['Value']-wearable_daily_df_detection['baseline_mean'])/wearable_daily_df_detection['baseline_std']
## get summary statistics for each deviation metrics
wearable_daily_df_detection_summary_stat = wearable_daily_df_detection.groupby('participant_id').agg(
delta_avg = ('Delta', np.mean),
delta_max = ('Delta', max),
delta_min = ('Delta', min),
delta_range = ('Delta', np.ptp),
delta_median = ('Delta', np.median),
normalized_delta_avg = ('Delta_Normalized', np.mean),
normalized_delta_max = ('Delta_Normalized', max),
normalized_delta_min = ('Delta_Normalized', min),
normalized_delta_range = ('Delta_Normalized', np.ptp),
normalized_delta_median = ('Delta_Normalized', np.median),
standardized_delta_avg = ('Delta_standardized', np.mean),
standardized_delta_max = ('Delta_standardized', max),
standardized_delta_min = ('Delta_standardized', min),
standardized_delta_range = ('Delta_standardized', np.ptp),
standardized_delta_median = ('Delta_standardized', np.median),
zscore_delta_avg = ('Z_score', np.mean),
zscore_delta_max = ('Z_score', max),
zscore_delta_min = ('Z_score', min),
zscore_delta_range = ('Z_score', np.ptp),
zscore_delta_median = ('Z_score', np.median),
covid_status=('covid_status', np.mean)
)
return wearable_daily_df_detection_summary_stat
### Function Run two sample test on features based on a grouping variable (covid_status) and return p-value plus summary metrics
def parametric_two_sample_ttest(df):
column_list = [x for x in df.columns if x != 'covid_status']
# create an empty dictionary
t_test_results = {}
# loop over column_list and execute code
for column in column_list:
group1 = df.where(df.covid_status== 1).dropna()[column] # for covid positive group
group2 = df.where(df.covid_status== 0).dropna()[column] # for covid positive group
# add the output to the dictionary
t_test_results[column] = stats.ttest_ind(group1,group2)
results_df = pd.DataFrame.from_dict(t_test_results,orient='Index')
results_df.columns = ['statistic','pvalue']
results_df.drop(columns=['statistic'],inplace=True)
return results_df
### Function to train a model on a dataset with nested cross validation with outer loop as model selection and inner loop for hyperparameter tuning
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,cross_val_predict,StratifiedKFold
from sklearn.preprocessing import StandardScaler
## define nested cross validation
def nested_cross_validate_predict_proba(X, y, model, param_grid):
# configure the cross-validation procedure
cv_inner = StratifiedKFold(5)
cv_outer=StratifiedKFold(10)
# set up the inner CV loop
clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring='recall', cv=cv_inner,n_jobs=-1)
# set up outer CV loop
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('model', clf)])
# execute the nested cross-validation
probabilities = cross_val_predict(pipe, X, y,cv=cv_outer,n_jobs = -1,verbose = True,method='predict_proba')
return probabilities
### Import Preprocessed wearable data with daily RHR and step count data
# Column Names:participant_id, covid_status, rhr, step_count, days_from_test
# covid_status is a integer value: covid positive (1)/ covid negative (0)
wearable_df = pd.read_csv('wearable_digital_biomarkers.csv', index_col = False)
#defining baseline and detection period
baseline_start_date=-60
baseline_end_date=-22
detection_start_date=-5
detection_end_date=-1
### Extract Features from time series wearable data
#get features for RHR
wearable_rhr_feature_df=get_delta_summary_stat(wearable_df.rename(columns = {'rhr': 'Value'}),baseline_start_date, baseline_end_date, detection_start_date,detection_end_date) # 3 days of Detecion Period
wearable_rhr_feature_df=wearable_rhr_feature_df.add_prefix('rhr_')
#get features for steps
wearable_step_feature_df=get_delta_summary_stat(wearable_df.rename(columns = {'step_count': 'Value'}),baseline_start_date, baseline_end_date, detection_start_date,detection_end_date)
wearable_step_feature_df=wearable_step_feature_df.add_prefix('step_')
wearable_feature_df=wearable_step_feature_df.join(wearable_rhr_feature_df,on="participant_id")
#get covid_status from both rhr and step data
wearable_feature_df['covid_status']=wearable_feature_df[['rhr_covid_status','step_covid_status']].max(axis=1)
### split the dataset into training and test set with a 80-20 ratio
### keep the ratio of the covid positive and negative participants the same in both training and test set
from sklearn.model_selection import train_test_split
participants_list=wearable_feature_df.index.tolist()
covid_status=wearable_feature_df['covid_status']
participants_list_train, participants_list_test, covid_status_train,covid_status_test=train_test_split(participants_list, covid_status, test_size=0.2, random_state=42, stratify=covid_status)
wearable_feature_train_df = wearable_feature_df[wearable_feature_df['participant_id'].isin(participants_list_train)]
wearable_feature_test_df = wearable_feature_df[wearable_feature_df['participant_id'].isin(participants_list_test)]
### feature selection using statistical analysis on the training set
### run two sample ttest on the training set and only use the features that statistically significant between covid positive and negative groups
wearable_feature_train_df.reset_index(inplace=True)
stat_results=parametric_two_sample_ttest(wearable_feature_train_df.drop(columns=['participant_id', 'rhr_covid_status','step_covid_status'])) # only pass the columns with features and covid_Status
# adjust p-values using Benjamini-Hochberg procedure for multiple hypotheses testing
from statsmodels.stats.multitest import multipletests
adjusted_p_value=multipletests(stat_results['pvalue'], alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False)
stat_results['pvalue']=adjusted_p_value[1]
#get the list of significant features with p-value < 0.05
stat_results.reset_index(inplace=True)
significant_features=stat_results[stat_results['pvalue']<0.05]['index']
### Run ITA Classification Model
target_and_grouping_features=['participant_id','covid_status']
wearable_features=list(significant_features)
features_to_extract=target_and_grouping_features+wearable_features
# extract only the relevant columns from the dataframe
train_df=wearable_feature_train_df[features_to_extract].copy()
# defining target variable and estimating variables
X = train_df[wearable_features].copy() # Features
y = train_df.covid_status # Target variable
### Define model for the imbalanced dataset
classifier=LogisticRegression(class_weight='balanced', max_iter=1000)
# define parameter grid
param_grid={"C":[0.001, 0.01, 0.1, 1, 10, 100],
"penalty":['l1','l2'],
"solver":['liblinear','lbfgs']}
# get problity of each individual of getting tested positive for covid-19
y_prob=nested_cross_validate_predict_proba(X, y, classifier, param_grid)
#creating a dataframe with the true value and probability
ranking_df=pd.DataFrame(list(zip(y,y_prob)),columns = ['True_Class', 'Predicted Probability'])
# create a ranked list of participants with higher likelihood getting tested positive for covid-19
ranking_df.sort_values(by=['Predicted Probability'],inplace=True,ascending=False,ignore_index=True)
ranking_df.reset_index(inplace=True)
## setting up a scenario of limited diagnostic setting with X% available test for overall population
percentage_of_rows=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] # 10-100% of the population
data=np.zeros((len(percentage_of_rows), 3))
for i in range(0,len(percentage_of_rows)):
number_of_rows=round(len(ranking_df.index)*percentage_of_rows[i])
ranking_df_subset=ranking_df.head(number_of_rows)
data[i,0]=round(len(ranking_df.index)*percentage_of_rows[i]) # number of data points
data[i,1]=ranking_df_subset['True_Class'].value_counts()[1] # number of covid positive
data[i,2]=ranking_df_subset['True_Class'].value_counts()[0] # number of covid negative
ranking_df_range=pd.DataFrame(data=data, columns=['number_of_available_tests', 'number_of_covid_positive', 'number_of_covid_negative'],dtype=None, copy=None)
ranking_df_range['overall_number_of_covid_positive']=ranking_df['True_Class'].value_counts()[1]
ranking_df_range['overall_percentage_of_covid_positive']=100*(ranking_df_range['overall_number_of_covid_positive']/len(ranking_df.index))
ranking_df_range['positivity_rate']=100*(ranking_df_range['number_of_covid_positive']/ranking_df_range['number_of_tests'])
# create a plot with resultant positivity rate from ITA-identified subpopulation
fig = plt.figure(figsize=(6, 6))
plt.rcParams.update({'font.size': 12})
ax = fig.add_subplot(1,1,1)
sns.lineplot(data=ranking_df_range,x='number_of_tests',y='positivity_rate',legend=False,color='blue',ax=ax)
sns.lineplot(data=ranking_df_range,x='number_of_tests',y='overall_percentage_of_covid_positive',legend=False,color='red',linestyle='--',ax=ax)
ax.set_xlabel('Number of Available Diagnostic Tests')
ax.set_ylabel('Positivity Rate')
ax.set_ylim(0, 100)
plt.title('Intelligent Testing Allocation Subpopulation')
plt.savefig('Plot of Testing Allocation.jpg',format='jpg', dpi = 400)
plt.show()