5.Further validation of recovered signals.Rmd

---
title: "Further validation of the recovered signals"
author: "Bai Ling"
date: "`r format(Sys.Date(), format = '%d %B %Y')`"
output: 
  html_document:
    theme: cerulean
    css: style.css
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.align = "center")
options(stringsAsFactors = FALSE)
setwd("/Users/apple/Documents/GitHub/EWAS/")
```

## Contents
***
- [1. Down-sampling analysis](#downsample)
- [2. Increasing power to discovery true signals](#age)
- [3. Some well-known smoking-associated CpGs were recovered](#smoke)
<br>
<br>
<br>
    
#### Library R packages and basic settings    
```{r library prepare, message=FALSE, warning=FALSE}
library(tidyverse)
library(circlize)
library(ComplexHeatmap)
library(reshape2)
library(UpSetR)
library(ggplotify)
library(cowplot)
library(VennDiagram)
library(ggpubr)
library(readxl)
library(ggforce)

measure_col <- c("BH"="#D55E00", "ST"="#CC79A7", "sd.b"="#7570B3", "sd.m"="#E7298A", "mean"="#A6CEE3", 
                 "mad"="#1F78B4", "dip"="#B2DF8A", "precision"="#FFFF00", "icc.b"="#FB9A99", 
                 "icc.m"="#E31A1C", "refgene.pos"="#FDBF6F", "chr"="#A6761D", "cpg.loc"="#1B9E77", 
                 "dhs"="#949494", "direction"="#7FFFD4", "probe.type" ="#FFE4E1")
method_col <- c("CAMT" = "#FF0000", "AdaPT" = "#E69F00", "IHW" = "#56B4E9", 
                "FDRreg" = "#009E73", "BL" = "#0072B2", BH="#D55E00", ST="#CC79A7")
```

### 1. Down-sampling analysis {#downsample}
***
EWAS51 is a dataset with binary phenotype (FASD and control). We sampled each phenotype randomly at sample size 10, 20, 30, 40, 50, 60, 70, 80, 90 and 100 by method IHW and CAMT. The power of each method with different covariates are displyed as follows.  
<br>

#### 1.1 Method IHW
```{r, fig.align='center'}
selected_measure <- c("BH", "ST", "chr", "cpg.loc", "dhs", "dip", "direction", "mad", "mean",
                      "precision", "probe.type", "refgene.pos", "sd.b", "sd.m")
selected_samplesize <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
IHW_power <- readRDS("Data/EWAS51_IHW_downsample_power.RDS")

IHW_power <- IHW_power %>%
  filter(measure %in% selected_measure, sample_size %in% selected_samplesize)

cdata <- plyr::ddply(IHW_power, c("measure", "sample_size"), summarise,
               N    = sum(!is.na(Power)),
               mean = mean(Power, na.rm = TRUE),
               sd   = sd(Power, na.rm = TRUE),
               se   = sd / sqrt(N))

fs13 <- ggplot(data = cdata, aes(x = factor(sample_size), y = mean, colour = measure)) +
 geom_errorbar(aes(ymin=mean-se, ymax=mean+se), 
               width = 0.8, 
               position = position_dodge(width = 0.5),
               lwd = 0.5) +
  geom_point(position = position_dodge(width = 0.5), size = 0.5, shape = 16) +
  scale_color_manual(values = measure_col) +
  xlab("Sample Size") + ylab("Percentage") +
  theme_bw() +
  theme(axis.title = element_text(size = 10),
        axis.text = element_text(size = 8),
        legend.text = element_text(size = 6),
        legend.title = element_text(size = 8))

fs13
```
<br>
<br>

```{r, include=FALSE}
ggsave("Figures/FigureS13.pdf", fs13, width = 7, height = 4, useDingbats = FALSE)
```

#### 1.2 Method CAMT
```{r method CAMT, fig.align='center'}
CAMT_power <- readRDS("Data/EWAS51_CAMT_downsample_power.RDS")
CAMT_power <- CAMT_power %>%
  filter(measure %in% selected_measure, sample_size %in% selected_samplesize)

cdata <- plyr::ddply(CAMT_power, c("measure", "sample_size"), summarise,
               N    = sum(!is.na(Power)),
               mean = mean(Power, na.rm = TRUE),
               sd   = sd(Power, na.rm = TRUE),
               se   = sd / sqrt(N))

f4 <- ggplot(data = cdata, aes(x = factor(sample_size), y = mean, colour = measure)) +
 geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width = 0.8, 
               position = position_dodge(width = 0.5),
               lwd = 0.5) +
  geom_point(position = position_dodge(width = 0.5), size = 0.5, shape = 16) +
  scale_color_manual(values = measure_col) +
  xlab("Sample Size") + ylab("Percentage") +
  theme_bw() +
  theme(axis.title = element_text(size = 10),
        axis.text = element_text(size = 8),
        legend.text = element_text(size = 6),
        legend.title = element_text(size = 8))

f4
```

```{r, include=FALSE}
ggsave("Figures/Figure4.pdf", f4, width = 7, height = 4, useDingbats = FALSE)
```


### 2. Increasing power to discovery true signals {#age}
***
#### 2.1 Overview of dataset EWAS45 across different methods and covariates
```{r}
selected_measure <- c("mean", "sd.b", "mad", "icc.m", "icc.b", "precision", "dhs", "direction",
                      "chr", "refgene.pos", "dip", "probe.type", "cpg.loc", "sd.m")

# read in adjusted p-values' data into a list
pvalues <- list()
for(i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
  pvalues[[i]] <- readRDS(paste0("Data/EWAS45_", i, "_adjusted_pvalues.RDS"))
}

# calculate the number of significant DMPs
discovery <- as.data.frame(lapply(pvalues, function(x)(apply(x, 2, function(x)sum(x < 0.05)))))
discovery <- as.data.frame(t(discovery))

# extract BH and ST DMPs' numbers
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)
plot_data <- discovery %>%
  select(selected_measure)

plot_data <- as.matrix((plot_data - ST)/ST)*100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))

col_fun = colorRamp2(c(-200, -100, 0, 200), 
                     c("royalblue", "blue", "white", "red"), 
                     space = "RGB")
panel_A <- Heatmap(plot_data,
        name = "Color Key",
        col = col_fun,
        rect_gp = gpar(col = "grey", lwd = 0.25),
        cell_fun = function(j, i, x, y, width, height, fill){
          if(abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 8))
        },
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        row_names_gp = gpar(fontsize = 8),
        column_names_gp = gpar(fontsize = 8),
        heatmap_legend_param = list(title_gp = gpar(fontsize = 8), 
                                    labels_gp = gpar(fontsize = 6),
                                    labels = c("-200%", "-100%", 0, "100%", "200%")))
draw(panel_A)
panel_A <- grid.grabExpr(draw(panel_A))
```
<br>
<br>

#### 2.2 Boxplot of gold standard CpGs' significance orders by covariate under different methods
```{r, message=FALSE}
# get the significance order for each cpg
ranks <- lapply(pvalues, function(x)as.data.frame(apply(x, 2, function(x)rank(x, ties.method = "random"))))
selected_method <- "IHW"

gold_standard <- read.csv("Data/GoldStandardCpG.T4.csv", header = TRUE, row.names = 1)
selected_cpg <- gold_standard$x[gold_standard$x %in% rownames(ranks[[1]])]

reference_line <- median(ranks[[selected_method]][selected_cpg, "ST"])
        
panel_B <- ranks[[selected_method]][selected_cpg, ] %>%
  select(selected_measure) %>%
  melt(value.name = "Significance_rank") %>%
  group_by(variable) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(variable, Significance_rank, FUN = median), y = Significance_rank, color = variable, fill = variable)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  geom_segment(aes(x = variable, xend = variable,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none")
panel_B

selected_method <- c("AdaPT", "BL", "CAMT", "FDRreg")
fs14 <- as.data.frame(ranks[selected_method])[selected_cpg, ] %>%
  melt(value.name = "Significance_rank") %>%
  mutate(method = sapply(str_split(variable, fixed("."), n = 2), function(x)unlist(x)[1]),
         measure = sapply(str_split(variable, fixed("."), n = 2), function(x)unlist(x)[2])) %>%
  filter(measure %in% selected_measure) %>%
  select(Significance_rank, method, measure) %>%
  group_by(method, measure) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(measure, Significance_rank, FUN = median), y = Significance_rank, color = measure, fill = measure)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16, show.legend = FALSE) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  geom_segment(aes(x = measure, xend = measure,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  facet_wrap(~ method, nrow = 2) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

fs14
```
<br>
<br>

```{r, include=FALSE}
save_plot("Figures/FigureS14.pdf", fs14, base_width = 6.69, base_height = 5)
```

#### 2.3 Boxplot of gold standard CpGs' significance orders by different methods
```{r, message=FALSE}
# choose the most powerful covariate mean
subset_rank <- as.data.frame(lapply(ranks, function(x)x[, "mean", drop = FALSE]))
colnames(subset_rank) <- names(ranks)

selected_method <- c("CAMT", "AdaPT", "IHW", "BL", "FDRreg")
panel_C <- subset_rank[selected_cpg, ] %>%
  select(selected_method) %>%
  melt(value.name = "Significance_rank") %>%
  ggplot(aes(x = variable, y = Significance_rank)) +
  geom_boxplot(aes(fill = variable), width = 0.72, fatten = 1) +
  scale_fill_manual(values = method_col) + 
  scale_x_discrete(limits = selected_method) +
  xlab("") + ylab("Significance Rank") + labs(fill = "method") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none") +
  geom_hline(yintercept = reference_line, linetype = "dashed", color = "black", size = 0.5)

panel_C
```
<br>
<br>
```{r, fig.width=7, fig.height=8}
# organize figures into Figure 5
bottom_row <- plot_grid(panel_B, panel_C, nrow = 1, align = "h", 
                        labels = c("B", "C"), label_size = 18)
f5 <- plot_grid(panel_A, bottom_row, ncol = 1,
                labels = c("A", ""), label_size = 18)
f5
```

```{r, include=FALSE}
save_plot("Figures/Figure5.pdf", plot = f5, base_width = 6, base_height = 6)
```

#### 2.4 Validation in another data set EWAS27
```{r, message=FALSE}
selected_measure <- c("mad", "mean", "sd.b", "precision", "sd.m", "dhs", "dip", "chr",  "direction", "probe.type", "refgene.pos", "cpg.loc")

# read in adjusted p-values' data into a list
pvalues <- list()
for(i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
  pvalues[[i]] <- readRDS(paste0("Data/EWAS27_", i, "_adjusted_pvalues.RDS"))
}

## heatmap
discovery <- as.data.frame(lapply(pvalues, function(x)(apply(x, 2, function(x)sum(x < 0.05)))))
discovery <- as.data.frame(t(discovery))

# extract BH and ST signal numbers
BH <- unique(discovery$BH)
ST <- unique(discovery$ST)
plot_data <- discovery %>%
  select(selected_measure)

plot_data <- as.matrix((plot_data - ST)/ST)*100
cell_lable <- paste0(round(plot_data, 0), "%")
cell_lable <- matrix(cell_lable, nrow = nrow(plot_data), ncol = ncol(plot_data))
col_fun = colorRamp2(c(-100, 0, 200), c("blue", "white", "red"), space = "RGB")
fs15_1 <- Heatmap(plot_data,
        name = "Color Key",
        col = col_fun,
        rect_gp = gpar(col = "grey", lwd = 0.25),
        cell_fun = function(j, i, x, y, width, height, fill){
          if(abs(plot_data[i, j]) > 1) grid.text(cell_lable[i, j], x, y, gp = gpar(fontsize = 8))
        },
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        clustering_method_rows = "ward.D",
        clustering_method_columns = "ward.D",
        row_names_gp = gpar(fontsize = 8),
        column_names_gp = gpar(fontsize = 8),
        heatmap_legend_param = list(labels_gp = gpar(fontsize = 6),
                                    title_gp = gpar(fontsize = 8),
                                    at = c(-100, 0, 100, 200),
                                    labels = c("-100%", 0, "100%", "200%")))
fs15_1 <- grid.grabExpr(draw(fs15_1))

## boxplot
ranks <- lapply(pvalues, function(x)as.data.frame(apply(x, 2, function(x)rank(x, ties.method = "average"))))

selected_method <- "IHW"
gold_standard <- read.csv("Data/GoldStandardCpG.T4.csv", 
                          header = TRUE, row.names = 1)
selected_cpg <- gold_standard$x[gold_standard$x %in% rownames(ranks[[1]])]

reference_line <- median(ranks[[selected_method]][selected_cpg, "ST"])

fs15_2 <- ranks[[selected_method]][selected_cpg, ] %>%
  select(selected_measure) %>%
  melt(value.name = "Significance_rank") %>%
  group_by(variable) %>%
  mutate(significance_rank_median = median(Significance_rank)) %>%
  ggplot(aes(x = reorder(variable, Significance_rank, FUN = median), y = Significance_rank, color = variable, fill = variable)) +
  geom_sina(size = 0.25, alpha = 0.2, shape = 16) +
  geom_hline(aes(yintercept = reference_line), color = "black", size = 0.5, linetype = "dashed") +
  geom_segment(aes(x = variable, xend = variable,
                   y = reference_line, yend = significance_rank_median),
               size = 0.25, color = "black") +
  stat_summary(fun.y = median, geom = "point", size = 1.5, shape = 21, color = "black") +
  scale_fill_manual(values = measure_col) +
  scale_color_manual(values = measure_col) +
  scale_x_discrete(limits = selected_measure) +
  xlab("") + ylab("Significance Rank") + labs(fill = "") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none")

subset_rank <- as.data.frame(lapply(ranks, function(x)x[, "mean", drop = FALSE]))
colnames(subset_rank) <- names(ranks)

selected_method <- c("CAMT", "AdaPT", "IHW", "BL", "FDRreg")
fs15_3 <- subset_rank[selected_cpg, ] %>%
  select(selected_method) %>%
  melt(value.name = "Significance_rank") %>%
  ggplot(aes(x = variable, y = Significance_rank)) +
  geom_boxplot(aes(fill = variable), width = 0.72, fatten = 1) +
  scale_fill_manual(values = method_col) + 
  scale_x_discrete(limits = selected_method) +
  xlab("") + ylab("Significance Rank") + labs(fill = "method") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, hjust = 0.5, vjust = 0.5, size = 8),
        axis.title = element_text(size = 10),
        legend.position = "none") +
  geom_hline(yintercept = reference_line, linetype = "dashed", color = "black", size = 0.5)
```

```{r, fig.width=7, fig.height=6}
## put all figures together
bottom_row <- plot_grid(fs15_2, fs15_3, labels = c("B", "C"), align = "h", nrow = 1, label_size = 18)
fs15 <- plot_grid(fs15_1, bottom_row, labels = c("A", ""), ncol = 1, label_size = 18)
fs15
```

```{r, include=FALSE}
save_plot("Figures/FigureS15.pdf", plot = fs15, base_width = 6, base_height = 6)
```


### 3. Some well-known smoking-associated CpGs were recovered {#smoke}
***
#### 3.1 Compare covariate-adaptive FDR methods' result with reported smoking-related CpGs
```{r overview of results}
selected_measure <- c("mean", "sd.b", "mad", "precision", "dhs", "chr", "refgene.pos",
                      "dip", "probe.type", "cpg.loc", "sd.m")

# read in adjusted p-values' data into a list
pvalues <- list()
for(i in c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")){
  pvalues[[i]] <- readRDS(paste0("Data/EWAS20_", i, "_adjusted_pvalues.RDS"))
}

raw_probes <- rownames(pvalues[[1]])
sig_probes <- lapply(pvalues, function(x)apply(x, 2, function(x)raw_probes[x < 0.05]))
```
<br>
<br>

##### 3.1.1 Method IHW
```{r}
result <- sig_probes[["IHW"]]

# compare with the results of GSE50660-originated paper
paper_results <- read_xlsx("Data/Known smoking-associated CpGs.xlsx", 
                           sheet = "GSE50660")

overlap <- lapply(result, function(x)intersect(x, paper_results$CpGs))
reccurence <- as.data.frame(lapply(overlap, function(x)sprintf("%s/%s", length(x), length(paper_results$CpGs))))

(reccurence <- reccurence[, c("BH", "ST", selected_measure)])

# extra findings by covariates
identical(result[["BH"]], result[["ST"]])
BH <- result[["BH"]]
result$BH <- NULL
result$ST <- NULL
extra_findings <- lapply(result, function(x)setdiff(setdiff(x, BH), paper_results$CpGs))
(extra_findings <- extra_findings[selected_measure])

# see if the extra findings are in the list of other two papers
race_results <- read_xlsx("Data/Known smoking-associated CpGs.xlsx", 
                          sheet = "different_population")
(overlap_1 <- lapply(extra_findings, function(x)intersect(x, race_results$CpGs)))
multiple_results <- read_xlsx("Data/Known smoking-associated CpGs.xlsx",
                              sheet = "multiple_reported")
(overlap_2 <- lapply(extra_findings, function(x)intersect(x, multiple_results$CpGs)))


IHW <- unique(unlist(extra_findings)) # save for later use
```
<br>
<br>

##### 3.1.2 Method CAMT
```{r}
result <- sig_probes[["CAMT"]]

# compare with the results of GSE50660-originated paper
overlap <- lapply(result, function(x)intersect(x, paper_results$CpGs))
reccurence <- as.data.frame(lapply(overlap, function(x)sprintf("%s/%s", length(x), length(paper_results$CpGs))))
(reccurence <- reccurence[, c("BH", "ST", selected_measure)])

# extra findings by covariates
identical(result[["BH"]], result[["ST"]])
BH <- result[["BH"]]
result$BH <- NULL
result$ST <- NULL
extra_findings <- lapply(result, function(x)setdiff(setdiff(x, BH), paper_results$CpGs))
(extra_findings <- extra_findings[selected_measure])

# see if the extra findings are in the list of other two papers
(overlap_1 <- lapply(extra_findings, function(x)intersect(x, race_results$CpGs)))
(overlap_2 <- lapply(extra_findings, function(x)intersect(x, multiple_results$CpGs)))
CAMT <- unique(unlist(extra_findings)) # save for later use
```
<br>
<br>

#### 3.2 Compare the methylation level of cg18092474 between groups
```{r}
methy_data <- readRDS("Data/cg18092474.RDS")

# count samples within each group
table(methy_data$group)
```
<br>

```{r}
fs16 <- methy_data %>%
  ggplot(aes(x = group, y = bVals, fill = group)) + 
  stat_boxplot(geom = "errorbar", width = 0.3) + 
  geom_boxplot(width = 0.6) + 
  scale_x_discrete(labels = c("current smoker (n=22)", "former smoker (n=263)", "never (n=179)")) +
  annotate("text", label = "BH: 0.090       ST: 0.088\nIHW: 0.021       CAMT: 0.019",
           x = 2, y = 0.9:1, color = "red") +
  geom_segment(aes(x = 1, y = 0.9, xend = 3, yend = 0.9), size = 0.25) +
  geom_segment(aes(x = 1, y = 0.8, xend = 1, yend = 0.9), size = 0.1) +
  geom_segment(aes(x = 3, y = 0.85, xend = 3, yend = 0.9), size = 0.1) +
  ylim(0, 1) +
  xlab("Group") +
  theme_bw() + 
  theme(legend.position = "none") 
fs16
```
<br>
<br>


```{r, include=FALSE}
save_plot("Figures/FigureS16.pdf", plot = fs16, base_width = 5, base_height = 6)
```

#### 3.3 Intersetion of method IHW and CAMT
```{r venn plot, fig.width=4, fig.height=3}
fs17 <- draw.pairwise.venn(area1 = length(IHW),
                           area2 = length(CAMT),
                           cross.area = length(intersect(IHW, CAMT)),
                           category = c("IHW", "CAMT"),
                           fill = c("pink", "skyblue"),
                           cat.pos = c(0, 0),
                           scaled = FALSE)
```
<br>
<br>

```{r, include=FALSE}
save_plot("Figures/FigureS17.pdf", fs17)
```