forked from GreenleafLab/scScalpChromatin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
misc_helpers.R
396 lines (313 loc) · 13.9 KB
/
misc_helpers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
#####################################
# Various helper functions in R
#####################################
suppressPackageStartupMessages({
library(dplyr)
library(Matrix)
library(tidyr)
library(magrittr)
library(Rcpp)
library(RcppAlgos)
})
#####################################
# Miscellaneous
#####################################
`%ni%` <- Negate(`%in%`)
makeBins <- function(x, breaks=NULL, bins=10, bin.type="width"){
# Return vector of labels for bins made from x
########################
# x = numeric vector
# breaks = manually set break values (signpost style, i.e. n+1 breaks for n bins)
# bins = number of bins to return
# bin.type = type of bins ('width' = equal width bins, 'size' = bins contain same number of elements)
if(is.null(breaks)){
stopifnot(bin.type %in% c('width', 'size'))
if(bin.type == 'width') breaks <- seq(min(x), max(x), length.out=bins+1)
if(bin.type == 'size') breaks <- quantile(x, probs=c(0,1/bins*c(1:bins)))
}
ids <- as.numeric(cut(x, breaks=breaks, include.lowest=TRUE))
return(list(ids=ids, breaks=breaks))
}
invertList <- function(lst){
# Swap names and values in list
split(rep(names(lst), lengths(lst)), unlist(lst))
}
confusionMatrix <- function(i = NULL, j = NULL){
ui <- unique(i)
uj <- unique(j)
m <- Matrix::sparseMatrix(
i = match(i, ui),
j = match(j, uj),
x = rep(1, length(i)),
dims = c(length(ui), length(uj))
)
rownames(m) <- ui
colnames(m) <- uj
m
}
getFreqs <- function(x){
# Return a named vector of frequencies of x
tab <- table(x) %>% as.data.frame.table()
frqs <- tab$Freq
names(frqs) <- tab[,1]
frqs[order(frqs, decreasing=TRUE)]
}
jaccardIndex <- function(mat, i, j){
# Calculate Jaccard Index between row i and column j in matrix mat
# (Matrix is an intersection matrix of categories in rows i and columns j)
AiB <- mat[i,j]
AuB <- sum(mat[i,]) + sum(mat[,j]) - AiB
AiB/AuB
}
#####################################
# Misc Genomics Tools
#####################################
convertMouseGeneList <- function(genes){
# Function to convert mouse to human gene names
# see: https://www.r-bloggers.com/converting-mouse-to-human-gene-names-with-biomart-package/
require("biomaRt")
human <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="https://dec2021.archive.ensembl.org/")
mouse <- useMart("ensembl", dataset="mmusculus_gene_ensembl", host="https://dec2021.archive.ensembl.org/")
genesV2 <- getLDS(attributes=c("mgi_symbol"), filters="mgi_symbol", values=genes,
mart=mouse, attributesL=c("hgnc_symbol"), martL=human, uniqueRows=TRUE)
#humanx <- unique(genesV2[, 2])
#return(humanx)
return(genesV2)
}
gcContent <- function(gr, genome){
# Calculate GC content for a provided genomic range and a corresponding BSgenome
##########################################################################
# gr = genomic range
# genome = BSgenome object
freqs <- Biostrings::alphabetFrequency(Biostrings::getSeq(genome, gr))[,1:4]
gc <- apply(freqs, 1, function(x){sum(x[c("C","G")])/sum(x)})
gc
}
trim_N_seqs <- function(gr, genome, thresh=0){
# Identify and remove ranges that contain any N values
##########################################################################
# gr = genomic range
# genome = BSgenome object
# thresh = fraction of seq allowed to contain N's (e.g. 0.2 = 20% N's is acceptable)
N_freq <- Biostrings::alphabetFrequency(Biostrings::getSeq(genome, gr))[,15] # 15th column contains NA counts
N_frac <- N_freq / width(gr)
gr[N_frac <= thresh]
}
trim_oob <- function(gr){
# Identify and remove ranges that contain 'out of bounds' regions
# (gr must contain seqinfo!)
##########################################################################
# gr = genomic range
idx <- GenomicRanges:::get_out_of_bound_index(gr)
if(length(idx) != 0L){
gr <- gr[-idx]
}
return(gr)
}
#####################################
# Sampling cells
#####################################
numCellsPerGroup <- function(groups, total.cells=NULL, cells.per.group=NULL, min.cells.per.group=50, warn.only=TRUE){
# Number of cells to downsample from each group
# Returns a table of number of cells to downsample from each group
# Adapted from https://github.com/cssmillie/ulcerative_colitis/blob/master/downsample.r
######################
# groups = vector of group membership of each cell barcode
# total.cells = total number of cells to downsample to
# cells.per.group = number of cells per group to downsample
if(is.null(total.cells) & is.null(cells.per.group)){
stop("total.cells and cells.per.group cannot both be NULL!")
}
ncells <- table(groups) %>% sort() # Sorted low to high
# Check if any groups are 'too small'
if(any(ncells < min.cells.per.group)){
smallGroups <- paste(names(ncells[ncells < min.cells.per.group]), collapse=';')
if(warn.only){
message(sprintf("Warning: groups %s had fewer than %s cells", smallGroups, min.cells.per.group))
}else{
stop(sprintf("Error: groups %s had fewer than %s cells. Exiting...", smallGroups, min.cells.per.group))
}
}
# if sampling w/ cells.per.group, enforce thresholds
if(!is.null(cells.per.group)){
ncells[ncells > cells.per.group] <- cells.per.group
}else{
n <- table(groups) %>% sort()
if(length(ncells) == 1){
# In the case of one group, take total.cells from that group
ncells <- total.cells
names(ncells) <- names(n)
}else{
# For multiple groups, determine how many of each group to take
# First: get cumulative totals of cells from 0 to the largest group
u <- c(0, cumsum(n)[1:length(n)-1])
# Second: Try to evenly split cells between groups.
# If the smallest group isn't large enough, split
# subsequent groups into larger groups, appending the missing
# difference, and so on
i <- (total.cells - u) / seq(length(n), 1, -1) < n
# If we have more cells than requested, split:
if(sum(i) > 0){
ncells[i] <- as.integer(ceiling((total.cells - sum(n[!i]))/sum(i)))
}
}
}
ncells
}
downsampleByGroup <- function(cells, groups, ngene=NULL, total.cells=NULL, cells.per.group=NULL, replace=FALSE){
# Downsample cells evenly across groups
# Adapted from https://github.com/cssmillie/ulcerative_colitis/blob/master/downsample.r
#####################
# cells = vector of cell names
# groups = vector of cell groups
# ngene = vector of number of genes detected per cell (if provided, will prioritize cells with more genes)
# total.cells = total number of cells to downsample
# cells.per.group = how many cells per group (cannot use with total.cells)
# repalce = should cells be sampled with replacement
if(is.null(total.cells) & is.null(cells.per.group)){
stop("total.cells and cells.per.group cannot both be NULL!")
}
# Set ngene to ones if not provided
if(is.null(ngene)){
ngene <- structure(rep(1, length(cells)), names=cells)
}
if(is.null(names(ngene))){
names(ngene) <- cells
}
# Calculate group sizes to sample
groups <- as.factor(groups)
cellsPerGroup <- numCellsPerGroup(groups, total.cells=total.cells, cells.per.group=cells.per.group)
# Downsample cells within group
downSampled <- sapply(levels(groups), function(x){
# Shuffle cells within group
gcells <- cells[groups == x]
gcells <- sample(gcells, size=length(gcells), replace=FALSE)
# Subsample, prioritizing high gene counts (random sample if ngene not provided)
gcells[order(ngene[gcells], decreasing=TRUE)[1:cellsPerGroup[[x]]]]
})
# Combine and return
downSampled <- unlist(downSampled) %>% na.omit() %>% as.character()
return(downSampled)
}
selectCells <- function(covars, groupby=NULL, ngene=NULL, cells.use=NULL, max.cells=NULL, replace=FALSE){
# Select cells from groups defined by covariates matrix
# Adapted from https://github.com/cssmillie/ulcerative_colitis/blob/master/markers.r
#######################
# covars = data.frame of cells x covariates
# groupby = covariate groups to sample by (i.e. vector of column names to sample evenly across)
# cells.use = vector of cells allowed to be used
# max.cells = maximum number of cells to subset from each group
# repalce = should cells be sampled with replacement
# Cells to use:
cells <- rownames(covars)
if(!is.null(cells.use)){
cells <- intersect(cells, cells.use)
}
# Select cells evenly across groups (if we are subsampling cells)
if(!is.null(max.cells)){
# Prepare cell groups (i.e. define all unique combinations of groupby covariates)
if(is.null(groupby)){
# If groupby is not provided, will use all non-numeric covariates
groupby <- sapply(covars, function(x) !is.numeric(x))
}
groups <- as.factor(apply(covars[cells, groupby, drop=FALSE], 1, function(x) paste(x, collapse='.')))
names(groups) <- cells
# Perform downsampling (prioritizes cells with more genes if ngene is provided)
cells <- downsampleByGroup(cells, groups, ngene=ngene, total.cells=max.cells, replace=replace)
# Print how many cells in each group:
print(table(groups[cells]))
}
cells
}
#####################################
# GWAS and related tools
#####################################
parseVCF <- function(vcf_file, snpGR){
# Read in a previously generated bcftools vcf file
# The header of the VCF file has lines denoted by '##'. fread simply reads through these somehow.
# The meat of the vcf format is a tab delimited text file with the following columns:
# 1: Chromosome name
# 2: 1-based position on the chromosome
# 3. ID - identifier where available
# 4: Reference base at this position
# 5: Alternate base(s) at this position
# 6: quality (phred-scaled quality score for assertion made in alt)
# 7: filter status
# 8: Info - additional information
# 9: format - indicates the format of coming sample columns
# 10+ : individual sample columns
#######################################################################
# Arguments provided:
# vcf_file = previously generated vcf file
# snpGR = snp genomic range
require(vcfR)
vcf <- read.vcfR(vcf_file, verbose=TRUE)
vcf.fix <- as.data.frame(vcf@fix)
chrs <- vcf.fix$CHROM
positions <- as.integer(vcf.fix$POS)
vcfGR <- GRanges(seqnames=chrs, IRanges(positions, end=positions))
vcfGR$ref <- vcf.fix$REF
vcfGR$alt <- vcf.fix$ALT
vcfGR$linked_SNP <- snpGR$linked_SNP[findOverlaps(vcfGR, snpGR, maxgap=-1L, select="first", type="equal", ignore.strand=TRUE)]
vcfGR$linked_refalt <- snpGR$linked_refalt[findOverlaps(vcfGR, snpGR, maxgap=-1L, select="first", type="equal", ignore.strand=TRUE)]
# Add genotype data to GR
gtdf <- as.data.table(vcf@gt)
gtdf <- gtdf[,2:ncol(gtdf)] # drop format column that we don't care about
# Rename samples to match expected sample names
colnames(gtdf) <- sapply(colnames(gtdf), function(x){
strsplit(x, split="_")[[1]] %>% head(.,-1) %>% paste(.,collapse="_")
})
# Remove other information we don't want
gtdf <- apply(gtdf, c(1,2), function(x) strsplit(x, split=":")[[1]][1])
# A '.' indicates where a sample was unable to be genotyped
gtdf <- apply(gtdf, c(1,2), function(x) replace(x, grep("[.]", x), NA))
gtdf
values(vcfGR) <- cbind(values(vcfGR), gtdf)
vcfGR
}
#####################################
# Milo differential accessibility
#####################################
runMilo <- function(obj, dim_reduction, k=30, prop=0.1, min_cells_per_samp=50, contrast="diseaseStatus", sample_name="Sample"){
# Run all the necessary steps of Milo KNN graph-based differential abundance testing
#####################################################
# obj = A clustered seurat object
# dim_reduction = the name of the dim_reduction to use for Milo
# k = the number of neighbors for building KNN graph (30 is default)
# prop = Sampling proportion for building neighborhoods (0.1 is recommended for dataselts < 30k cells)
# min_cells_per_samp = minimum number of cells present in a given sample to use for Milo
require(miloR)
require(SingleCellExperiment)
# Subset seurat object to remove samples that are too lowly represented:
samp_freqs <- getFreqs(obj$Sample)
valid_samps <- samp_freqs[samp_freqs > min_cells_per_samp] %>% names()
Idents(rna_proj) <- "Sample"
sub_obj <- subset(rna_proj, idents = c(valid_samps))
# Create Milo object
obj_milo <- Milo(as.SingleCellExperiment(sub_obj))
# build KNN graph using LSI reduced dimensions
d <- length(obj@reductions[[dim_reduction]])
k <- 30 # 30 is default
reduced.dims <- toupper(dim_reduction) # Milo capitalizes the dim reduc names
obj_milo <- buildGraph(obj_milo, k=k, d=d, reduced.dim=reduced.dims)
# Defining representative neighborhoods on the KNN graph
prop <- prop # "We suggest using prop=0.1 for datasets of less than 30k cells. For bigger datasets using prop=0.05 should be sufficient (and makes computation faster)"
obj_milo <- makeNhoods(obj_milo, prop=prop, k=k, d=d, refined=TRUE, reduced_dims=reduced.dims)
# Counting cells in neighborhoods (to get how many cells from each sample are in each neighborhood)
obj_milo <- countCells(obj_milo, meta.data=as.data.frame(colData(obj_milo)), sample=sample_name)
# Defining experimental design
# "We implement this hypothesis testing in a generalized linear model (GLM) framework,
# specifically using the Negative Binomial GLM implementation in edgeR.""
milo_design <- data.frame(colData(obj_milo))[,c(sample_name, contrast)]
colnames(milo_design) <- c("Sample", "contrast")
milo_design <- distinct(milo_design)
rownames(milo_design) <- milo_design$Sample
# Computing neighborhood connectivity
obj_milo <- calcNhoodDistance(obj_milo, d=d, reduced.dim=reduced.dims)
# Testing differential abundance
da_results <- testNhoods(obj_milo, design = ~ contrast, design.df=milo_design, reduced.dim=reduced.dims)
# Visualize results:
obj_milo <- buildNhoodGraph(obj_milo)
return(list(milo_obj=obj_milo, da_results=da_results))
}
#################################################################################