-
Notifications
You must be signed in to change notification settings - Fork 0
/
beatAML_pred.R
116 lines (92 loc) · 4.1 KB
/
beatAML_pred.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
library(readr)
library(dplyr)
library(reshape2)
library(S4Vectors)
library(tibble)
library(caret)
library(pls)
library(xgboost)
input_dir = "/input/"
output_dir = "/output/"
rds_dir = "/rds/"
out_file=paste0(output_dir,"predictions.csv")
dnaseq_t <- read_csv(paste0(input_dir,"dnaseq.csv"))
rnaseq_t <- read_csv(paste0(input_dir,"rnaseq.csv"))
clinical_categorical_t <- read_csv(paste0(input_dir,"clinical_categorical.csv"))
clinical_numerical_t <- read_csv(paste0(input_dir,"clinical_numerical.csv"))
clinical_categorical_legend_t <- read_csv(paste0(input_dir,"clinical_categorical_legend.csv"))
colnames(clinical_categorical_t) <- make.names(colnames(clinical_categorical_t))
# drop columns with missing values
clinical_numerical_t <- clinical_numerical_t %>% select(-c("%.Blasts.in.PB","WBC.Count"))
# prepare rnaseq data
rna_log2counts_t <- as.matrix(rnaseq_t[,3:dim(rnaseq_t)[2]])
rownames(rna_log2counts_t) <- rnaseq_t$Gene
rna_counts_t <- round(2^rna_log2counts_t)
rownames(rna_counts_t) <- rownames(rna_log2counts_t)
# rnaseq cpm sum of total check
print(colSums(2^rna_log2counts_t))
# variance stabilizing transformation
#vstcounts_t <- vst(rna_counts_t, blind = FALSE)
#rownames(vstcounts_t) <- rownames(rna_log2counts_t)
#rna_log2counts_t <- vstcounts_t
# load lists of model fits and gene features
model.fits <- readRDS(file = paste0(rds_dir,"model-fits.rds"))
dnaseqGenes <- readRDS(file = paste0(rds_dir,"dnaseqGenes.rds"))
rnaseqGenes <- readRDS(file = paste0(rds_dir,"rnaseqGenes.rds"))
rna.preProc <- readRDS(file = paste0(rds_dir,"rna-preProc.rds"))
output_df <- c()
for (drug_t in names(model.fits)){
my_model <- model.fits[[drug_t]]
topGenes_dnaseq_t <- dnaseqGenes[[drug_t]]
topGenes_rnaseq_t <- rnaseqGenes[[drug_t]]
sigDE_preProc_t <- rna.preProc[[drug_t]]
sigDE_log2counts_t <- rna_log2counts_t[
which(rownames(rna_log2counts_t) %in% topGenes_rnaseq_t),]
if (any(rownames(varImp(my_model)$importance) %in% "V1")) {
Ncomp <- 1} else { Ncomp <-10}
print(Ncomp)
PLS_sigDE_t <- c()
print(drug_t)
if (Ncomp!=0){
PLS_sigDE_t <- predict(sigDE_preProc_t, comps = 1:Ncomp,
type = "scores", newdata = t(sigDE_log2counts_t))
}
full_data_t <- dcast(
dnaseq_t, lab_id ~ Hugo_Symbol, fun.aggregate =
function(x)(as.numeric(!isEmpty(x))) ) %>% select(-c("NPM1"))
missingGenos = setdiff(colnames(rna_log2counts_t),full_data_t$lab_id)
mtx <- cbind(missingGenos,matrix(0,length(missingGenos),dim(full_data_t)[2]-1))
colnames(mtx) <- c("lab_id",colnames(full_data_t)[-1])
full_data_t = rbind(full_data_t, mtx)
full_data_t <- full_data_t %>%
inner_join(
select(clinical_categorical_t,c("lab_id","NPM1","FLT3.ITD")),
by = "lab_id") %>%
select(one_of(c("lab_id",topGenes_dnaseq_t))) %>%
inner_join(
select(clinical_categorical_t,-c("NPM1","FLT3.ITD")) %>%
mutate_at(vars(-one_of("lab_id")),as.numeric), by = "lab_id") %>%
inner_join(clinical_numerical_t, by = "lab_id")
full_data_t <- full_data_t %>%
inner_join( as.data.frame(PLS_sigDE_t) %>%
mutate(lab_id=rownames(PLS_sigDE_t)), by = "lab_id")
full_data_t[setdiff(topGenes_dnaseq_t,names(full_data_t))] <- 0
full_data_t[setdiff(rownames(varImp(my_model)$importance),names(full_data_t))] <- 0
full_data_t <- full_data_t %>%
mutate_at(vars(-one_of("lab_id")),funs(as.numeric(as.character(.))))
rownames(full_data_t)=full_data_t$lab_id
full_data_t <- full_data_t %>% select(-c("lab_id"))
# mean impute missing values as only a couple NAs
cM <- colMeans(full_data_t, na.rm=TRUE)
indx <- which(is.na(full_data_t), arr.ind=TRUE)
if (!isEmpty(indx)){ full_data_t[indx] <- cM[indx[,2]]}
#full_data_t <- full_data_t[, my_model$feature_names]
#full_data_t_model <- model.matrix(~ .-1, full_data_t)
pred_t <- predict(my_model, newdata = full_data_t) #newdata = full_data_t_model)
aucs_pred <- data.frame(
lab_id = rownames(full_data_t), #rownames(full_data_t_model),
inhibitor = rep(drug_t,length(pred_t)),
auc = pred_t, row.names=NULL)
output_df = rbind(output_df, aucs_pred)
}
write_csv(output_df, out_file)