-
Notifications
You must be signed in to change notification settings - Fork 0
/
Script1-ReadLengthVsGenomeQuality.R
129 lines (94 loc) · 4.13 KB
/
Script1-ReadLengthVsGenomeQuality.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#Script1: Impact of read length on genome quality
#Parts
#1.1: Impact of read length on %BUSCO complete
#1.2: Impact of read length on contigN50
#1.3: Impact of read length on scaffold count
#1.4: Impact of read length on repeat classification
#1.5: Impact of read length on percentage of genome as chromosomes
#-------------------------PREPARE-ENVIRONMENT---------------------------------------#
#Open libraries
library("tidyverse")
library("ggplot2")
library("ggpmisc")
library("ggpubr")
#Set working directory
setwd("/Users/tkosch/OneDrive - The University of Melbourne/Documents/UoM/Projects/Amphib-Comparative-Genomics/R_analyses/")
#-------------------------INPUT&FORMATTING------------------------------------------#
#Input data
genomes <- read.csv("bbmap-busco6.csv", row.names = 1, , header = TRUE)
#Adjust to not use scientific notation in plots
options(scipen=999)
#Pull relevant columns
genomes_sub <- genomes[ , c("n_scaffolds", "scaf_bp", "ctg_N50" , "readlength", "chr.assembly",
"per_busco_complete", "unclassified_repeats", "classified_repeats",
"per.gen.chrs")]
#Exclude NA rows
genomes_sub_noNA <- genomes_sub[complete.cases(genomes_sub), ]
#Create new cols with n_scaffolds converted to Kb and Mb
genomes_sub_noNA$n_scaffolds_kb <- genomes_sub_noNA$n_scaffolds / 1000
genomes_sub_noNA$n_scaffolds_Mb <- genomes_sub_noNA$n_scaffolds / 1000000
genomes_sub_noNA$scaf_bp_Gb <- genomes_sub_noNA$scaf_bp / 1000000000
genomes_sub_noNA$ctg_N50_Kb <- genomes_sub_noNA$ctg_N50 / 1000
#Log transform
genomes_sub_noNA$ctg_N50_Kb_log10 <- log10(genomes_sub_noNA$ctg_N50_Kb)
genomes_sub_noNA$n_scaffolds_kb_log10 <- log10(genomes_sub_noNA$n_scaffolds_kb)
#-------------------------PART1.1---------------------------------------------------#
#1.1: Impact of read length on %BUSCO complete
#Boxplot
ggplot(genomes_sub_noNA, aes(x=readlength, y=per_busco_complete)) +
geom_boxplot() +
theme_classic() +
ylab ("BUSCO complete") +
xlab("Read type")
#Is variance equal? NO
#T-test
t.test(per_busco_complete ~ readlength, var.equal = FALSE, data = genomes_sub_noNA)
#t = 2.8544, df = 13.448, p-value = 0.01316
#-------------------------PART1.2---------------------------------------------------#
#1.2: Impact of read length on contigN50
#Boxplot
ggplot(genomes_sub_noNA, aes(x=readlength, y=ctg_N50_Kb_log10)) +
geom_boxplot() +
theme_classic() +
ylab ("Congtig N50") +
xlab("Read type")
#Is variance equal? YES
#T-test
t.test(ctg_N50_Kb_log10 ~ readlength, var.equal = TRUE, data = genomes_sub_noNA)
#t = 6.0471, df = 29, p-value = 0.0000014
#-------------------------PART1.3---------------------------------------------------#
#1.3: Impact of read length on scaffold count
#Boxplot
ggplot(genomes_sub_noNA, aes(x=readlength, y=n_scaffolds_kb_log10)) +
geom_boxplot() +
theme_classic() +
ylab (expression(Log[10]~scaffold~count)) +
xlab("Read type")
#Is variance equal? YES
#T-test
t.test(n_scaffolds_kb_log10 ~ readlength, var.equal = TRUE, data = genomes_sub_noNA)
#t = -4.8559, df = 29, p-value = 0.00003786
#-------------------------PART1.4---------------------------------------------------#
#1.4: Impact of read length on repeat classification
#Box plot
ggplot(genomes_sub_noNA, aes(x=readlength, y=classified_repeats)) +
geom_boxplot() +
theme_classic() +
ylab ("% classfied repeats") +
xlab("Read type")
#Is variance equal? NO
t.test(classified_repeats ~ readlength, var.equal = FALSE, data = genomes_sub_noNA)
#t = 3.5735, df = 26.241, p-value = 0.001393
#-------------------------PART1.5---------------------------------------------------#
#1.5: Impact of read length on percentage of genome as chromosomes
#Exclude genomes not assembled to chromosome level
genomes_sub_noNA_noZero <- genomes_sub_noNA[!grepl("no",genomes_sub_noNA$chr.assembly),]
#Box plot
ggplot(genomes_sub_noNA, aes(x=readlength, y=per.gen.chrs)) +
geom_boxplot() +
theme_classic() +
ylab ("% assembled to chromosomes") +
xlab("Read type")
#Is variance equal? NO
t.test(per.gen.chrs ~ readlength, var.equal = FALSE, data = genomes_sub_noNA)
#t = 3.6493, df = 28.484, p-value = 0.001047