-
Notifications
You must be signed in to change notification settings - Fork 10
/
Frontiers_ALR_supplementary.R
349 lines (293 loc) · 13.1 KB
/
Frontiers_ALR_supplementary.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
### Cancer data set
### read data from github site
cancer <- read.table("https://raw.githubusercontent.com/michaelgreenacre/CODAinPractice/master/Baxter_OTU_table.txt",
header=TRUE, check.names=FALSE)
dim(cancer)
# [1] 490 338
### remove first three columns to get the OTU dataset
cancer <- cancer[,-c(1:3)]
cancer.no0 <- cancer+1
# remove strong outlier, possibly an error
cancer.pro <- cancer.no0[,-265]/rowSums(cancer.no0[,-265])
### load easyCODA package
require(easyCODA)
### unweighted (i.e. equally weighted) option
starttime <- Sys.time()
cancer.alr <- FINDALR(cancer.pro, weight=FALSE)
endtime <- Sys.time()
difftime(endtime, starttime, units="secs")
# Time difference of 182.8436 secs on Toshiba latop
plot(cancer.alr$var.log,cancer.alr$procrust.cor)
cancer.alr$tot.var # [1] 1.530197
cancer.alr$procrust.max # [1] 0.9355615
cancer.alr$procrust.ref # [1] 269
cancer.alr$var.min # [1] 0.3082664
cancer.alr$var.ref # [1] 320
### weighted option
starttime <- Sys.time()
cancer.alrw <- FINDALR(cancer.pro, weight=TRUE)
endtime <- Sys.time()
difftime(endtime, starttime, units="secs")
# Time difference of 190.9042 secs on Toshiba laptop
cancer.alrw$tot.var # [1] 2.709184
cancer.alrw$procrust.max # [1] 0.9525636
cancer.alrw$procrust.ref # [1] 269
cancer.alrw$var.min # [1] 0.3082664
cancer.alrw$var.ref # [1] 320
### -----------------------------------------------------------------------------------------
### read meta data of Baxter microbiome study
meta <- read.delim("https://raw.githubusercontent.com/michaelgreenacre/CODAinPractice/master/Baxter_Metadata.txt",
header=TRUE, check.names=FALSE, sep="\t")
dim(meta)
# [1] 490 27
colnames(meta)
# [1] "sample" "fit_result" "Site" "Dx_Bin" "dx" "Hx_Prev" "Hx_of_Polyps"
# [8] "Age" "Gender" "Smoke" "Diabetic" "Hx_Fam_CRC" "Height" "Weight"
# [15] "BMI" "White" "Native" "Black" "Pacific" "Asian" "Other"
# [22] "Ethnic" "NSAID" "Abx" "Diabetes_Med" "stage" "Location"
# the group labels, also convert to numbers
dx <- meta[,"dx"]
table(dx)
# adenoma cancer normal
# 198 120 172
dx.num <- as.numeric(as.factor(dx))
table(dx.num)
# 1 2 3
# 198 120 172
### perform RDA of CLRs of the OTUS on the categorical variable dx (groups)
(cancer.rda <- rda(CLR(cancer.pro, weight=FALSE)$LR ~ factor(dx)))
# Inertia Proportion Rank
# Total 5.121e+02 1.000e+00
# Constrained 4.194e+00 8.189e-03 2 <- 0.82% of variance due to group differences
# Unconstrained 5.079e+02 9.918e-01 333 <- 99.18% unrelated to group differences
### permutation test of significance (9999 permutations)
set.seed(123)
anova(cancer.rda, permutations=9999)
# Df Variance F Pr(>F)
# Model 2 4.19 2.0106 1e-04 *** <- nevertheless, highly significant, p<0.0001
### variances explained in two-dimsneions in constrained and full spaces
100*cancer.rda$CCA$eig/cancer.rda$CCA$tot.chi
# RDA1 RDA2
# 74.62057 25.37943
100*cancer.rda$CCA$eig/cancer.rda$tot.chi
# RDA1 RDA2
# 0.6110919 0.2078403
### row coordinates in exact geometry in restricted 2-d space of group differences
cancer.rda.wa <- cancer.rda$CCA$wa
cancer.procrust.cor <- rep(0,nrow=ncol(cancer.pro))
### loop through the reference components but fit in the reduced space
starttime <- Sys.time()
for(j in 1:ncol(cancer.pro)) {
foo.alr <- ALR(cancer.pro, denom=j, weight=FALSE)$LR
foo.rda <- rda(foo.alr ~ factor(dx))
cancer.procrust.cor[j] <- protest(foo.rda$CCA$wa,cancer.rda.wa, permutations=0)$t0
}
endtime <- Sys.time()
difftime(endtime, starttime, units="secs")
# Time difference of 149.3339 secs on Toshiba laptop
max(cancer.procrust.cor)
# [1] 0.9996624
which(cancer.procrust.cor==max(cancer.procrust.cor))
# [1] 312
colnames(cancer.pro)[312]
# [1] "Otu000363"
### compute ALRs with this reference
cancer.alr312 <- ALR(cancer.pro, denom=312, weight=FALSE)$LR
(cancer.alr312.rda <- rda(cancer.alr312 ~ factor(dx)))
# Inertia Proportion Rank
# Total 6.514e+02 1.000e+00
# Constrained 4.194e+00 6.439e-03 2
# Unconstrained 6.472e+02 9.936e-01 333
cancer.alr312.wa <- cancer.alr312.rda$CCA$wa
### variances explained in constrained and full spaces
100*cancer.alr312.rda$CCA$eig/cancer.alr312.rda$CCA$tot.chi
# RDA1 RDA2
# 74.61953 25.38047
100*cancer.alr312.rda$CCA$eig/cancer.alr312.rda$tot.chi
# RDA1 RDA2
# 0.4804437 0.1634142
### plot 2-D configuration using all logratios
cancer.cols <- c("blue","red","forestgreen")
par(mar=c(4.2,4,3,1), mgp=c(2,0.7,0), font.lab=2)
plot(cancer.rda.wa, type="n", asp=1, main="Constrained LRA of OTUs",
xlab="LRA dimension 1 (74.6% / 0.61%)", ylab="LRA dimension 2 (25.4% / 0.21%)")
abline(v=0, h=0, col="gray", lty=2)
text(cancer.rda.wa, labels=substr(dx,1,1), col=cancer.cols[as.numeric(factor(dx))], cex=0.6)
set.seed(123)
CIplot_biv(cancer.rda.wa[,1],cancer.rda.wa[,2],
group=factor(dx), shade=TRUE, add=TRUE, groupcols=cancer.cols,
groupnames=c("A","C","N"))
set.seed(123)
CIplot_biv(cancer.rda.wa[,1],cancer.rda.wa[,2],
group=factor(dx), add=TRUE, groupcols=cancer.cols,
shownames=FALSE)
### plot 2-D configurations using best set of ALRs
par(mar=c(4.2,4,3,1), mgp=c(2,0.7,0), font.lab=2)
plot(cancer.alr312.wa, type="n", asp=1, main="RDA of ALRs w.r.t. 312",
xlab="RDA dimension 1 (74.6% / 0.48%)", ylab="RDA dimension 2 (25.4% / 0.16%)")
abline(v=0, h=0, col="gray", lty=2)
text(cancer.alr312.wa, labels=substr(dx,1,1), col=cancer.cols[as.numeric(factor(dx))], cex=0.6)
set.seed(123)
CIplot_biv(cancer.alr312.wa[,1],cancer.alr312.wa[,2],
group=factor(dx), shade=TRUE, add=TRUE, groupcols=cancer.cols,
groupnames=c("A","C","N"))
set.seed(123)
CIplot_biv(cancer.alr312.wa[,1],cancer.alr312.wa[,2],
group=factor(dx), add=TRUE, groupcols=cancer.cols,
shownames=FALSE)
### ----------------------------------------------------------------------------
### do all of above again with weighted components
### perform RDA of CLRs of the OTUS on the categorical variable dx (groups)
### average composition serves as default weights
c <- colMeans(cancer.pro)
(cancer.rdaw <- rda(CLR(cancer.pro)$LR%*%diag(sqrt(c)) ~ factor(dx)))
# Inertia Proportion Rank
# Total 2.714724 1.000000
# Constrained 0.017192 0.006333 2 <- 0.63% explained by group differences
# Unconstrained 2.697533 0.993667 333 <- 99.37% not related to group differences
### permutation test of significance (9999 permutations)
set.seed(123)
anova(cancer.rdaw, permutations=9999)
# Df Variance F Pr(>F)
# Model 2 0.01719 1.5519 0.0156 * <- still significant
### variances explained in constrained and full spaces
100*cancer.rdaw$CCA$eig/cancer.rdaw$CCA$tot.chi
# RDA1 RDA2
# 66.42767 33.57233
100*cancer.rdaw$CCA$eig/cancer.rdaw$tot.chi
# RDA1 RDA2
# 0.4206694 0.2126049
### row coordinates in exact geometry in restricted 2-d space of group differences
cancer.rdaw.wa <- cancer.rdaw$CCA$wa
cancer.procrustw.cor <- rep(0,nrow=ncol(cancer.pro))
### loop through the reference components but fit in the reduced space
starttime <- Sys.time()
for(j in 1:ncol(cancer.pro)) {
cc <- c*c[j]
cc <- cc[-j]
foo.alr <- ALR(cancer.pro, denom=j, weight=FALSE)$LR
foo.rda <- rda(foo.alr%*%diag(sqrt(cc)) ~ factor(dx))
cancer.procrustw.cor[j] <- protest(foo.rda$CCA$wa,cancer.rdaw.wa, permutations=0)$t0
}
endtime <- Sys.time()
difftime(endtime, starttime, units="secs")
# Time difference of 161.5026 secs on Toshiba laptop
max(cancer.procrustw.cor)
# [1] 0.9982797
which(cancer.procrustw.cor==max(cancer.procrustw.cor))
# [1] 241
colnames(cancer.pro)[241]
# [1] "Otu000262"
### compute ALRs with this reference
### compute the weights of the ALRs (later it is shown how to extract them from ALR object)
cancer.alr241 <- ALR(cancer.pro, denom=241)$LR
cc <- c*c[241]
cc <- cc[-241]
(cancer.alr241.rda <- rda(cancer.alr241 %*% diag(sqrt(cc)) ~ factor(dx)))
# Inertia Proportion Rank
# Total 1.461e-03 1.000e+00
# Constrained 6.740e-06 4.613e-03 2
# Unconstrained 1.454e-03 9.951e-01 279
cancer.alr241.wa <- cancer.alr241.rda$CCA$wa
### variances explained in constrained and full spaces
100*cancer.alr241.rda$CCA$eig/cancer.alr241.rda$CCA$tot.chi
# RDA1 RDA2
# 66.43916 33.56084
100*cancer.alr241.rda$CCA$eig/cancer.alr241.rda$tot.chi
# RDA1 RDA2
# 0.3064660 0.1548071
### plot 2-D configuration using all logratios
cancer.cols <- c("blue","red","forestgreen")
# invert second dimension to agree with previous plots
cancer.rdaw.wa[,2] <- -cancer.rdaw.wa[,2]
par(mar=c(4.2,4,3,1), mgp=c(2,0.7,0), font.lab=2)
plot(cancer.rdaw.wa, type="n", asp=1, main="Constrained weighted LRA of OTUs",
xlab="LRA dimension 1 (66.4% / 0.42%)", ylab="LRA dimension 2 (33.6% / 0.21%)")
abline(v=0, h=0, col="gray", lty=2)
text(cancer.rdaw.wa, labels=substr(dx,1,1), col=cancer.cols[as.numeric(factor(dx))], cex=0.6)
set.seed(123)
CIplot_biv(cancer.rdaw.wa[,1],cancer.rdaw.wa[,2],
group=factor(dx), shade=TRUE, add=TRUE, groupcols=cancer.cols,
groupnames=c("A","C","N"))
set.seed(123)
CIplot_biv(cancer.rdaw.wa[,1],cancer.rdaw.wa[,2],
group=factor(dx), add=TRUE, groupcols=cancer.cols,
shownames=FALSE)
### plot 2-D configurations using best set of ALRs
# invert second dimension to agree with previous plots
cancer.alr241.wa[,2] <- -cancer.alr241.wa[,2]
par(mar=c(4.2,4,3,1), mgp=c(2,0.7,0), font.lab=2)
plot(cancer.alr241.wa, type="n", asp=1, main="RDA of weighted ALRs w.r.t. 241",
xlab="RDA dimension 1 (66.4% / 0.31%)", ylab="RDA dimension 2 (33.6% / 0.15%)")
abline(v=0, h=0, col="gray", lty=2)
text(cancer.alr241.wa, labels=substr(dx,1,1), col=cancer.cols[as.numeric(factor(dx))], cex=0.6)
set.seed(123)
CIplot_biv(cancer.alr241.wa[,1],cancer.alr241.wa[,2],
group=factor(dx), shade=TRUE, add=TRUE, groupcols=cancer.cols,
groupnames=c("A","C","N"))
set.seed(123)
CIplot_biv(cancer.alr241.wa[,1],cancer.alr241.wa[,2],
group=factor(dx), add=TRUE, groupcols=cancer.cols,
shownames=FALSE)
### -----------------------------------------------------------------------------------------
### vaginal microbiome data set by Deng et al. (2018), cited and analysed by Wu et al. (2021)
### copy the data file Deng_vaginal_microbiome.txt on GitHub and read from clipboard (PC users)
vagina <- read.table("clipboard", check.names=FALSE)
### or read data from GitHub site
vagina <- read.table("https://raw.githubusercontent.com/michaelgreenacre/CODAinPractice/master/Deng_vaginal_microbiome.txt",
header=TRUE, check.names=FALSE)
vagina <- t(vagina)
dim(vagina)
# [1] 40 103
sum(vagina==0)/(nrow(vagina)*ncol(vagina)) #13% zeros
vagina1 <- vagina+1
vagina.pro <- vagina1/rowSums(vagina1)
require(easyCODA)
starttime <- Sys.time()
vagina.alr <- FINDALR(vagina.pro)
endtime <- Sys.time()
difftime(endtime, starttime, units="secs")
# Time difference of 0.7315788 secs on Toshiba.laptop
vagina.alr$totvar
# [1] 3.257595
vagina.alr$procrust.max
# [1] 0.968543
vagina.alr$procrust.ref
# [1] 51
starttime <- Sys.time()
vagina.alrw <- FINDALR(vagina.pro, weight=TRUE)
endtime <- Sys.time()
difftime(endtime, starttime, units="secs")
# Time difference of 0.8090138 secs on Toshiba laptop
vagina.alrw$tot.var
# [1] 7.710076
vagina.alrw$procrust.max
# [1] 0.9825666
vagina.alrw$procrust.ref
# [1] 12
### this illustrates getting ALR weights from function ALR
vagina.alr12 <- ALR(vagina.pro, denom=12)
vagina.alr12.LR <- vagina.alr12$LR
vagina.alr12.LRwt <- vagina.alr12$LR.wt
### exact weighted logratio geometry
vagina.lra <- LRA(vagina.pro)
vagina.lra.rpc <- vagina.lra$rowpcoord
100*vagina.lra$sv[1:2]^2/sum(vagina.lra$sv^2)
# [1] 63.39400 24.44925
par(mar=c(4.2,4,3,1), mgp=c(2,0.7,0), font.lab=2)
plot(vagina.lra.rpc, type="n", asp=1, main="LRA of vaginal microbiome",
xlab="LRA dimension 1 (63.4%)", ylab="LRA dimension 2 (24.4%)")
abline(v=0, h=0, col="gray", lty=2)
text(vagina.lra.rpc, labels=rownames(vagina), col="blue", cex=0.6)
### plot 2-D configuration using best set of ALRs
### note that weights in the ALR object are automatically used in the PCA
vagina.pca <- PCA(vagina.alr12)
vagina.pca.rpc <- vagina.pca$rowpcoord
vagina.lra.rpc <- vagina.lra$rowpcoord
100*vagina.pca$sv[1:2]^2/sum(vagina.pca$sv^2)
# [1] 76.67587 14.67200
par(mar=c(4.2,4,3,1), mgp=c(2,0.7,0), font.lab=2)
plot(vagina.pca.rpc, type="n", asp=1, main="PCA of ALRs w.r.t. 12",
xlab="PCA dimension 1 (74.7%)", ylab="RDA dimension 2 (14.7%)")
abline(v=0, h=0, col="gray", lty=2)
text(vagina.pca.rpc, labels=rownames(vagina), col="blue", cex=0.6)