Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support custom contig sep characters in pairix index #398

Merged
merged 3 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion src/cooler/cli/cload.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,24 @@ def tabix(
default=2,
show_default=True,
)
@click.option(
"--block-char",
help="Character separating contig names in the block names of the pairix "
"index.",
type=str,
default="|",
show_default=True,
)
def pairix(
bins, pairs_path, cool_path, metadata, assembly, nproc, zero_based, max_split
bins,
pairs_path,
cool_path,
metadata,
assembly,
nproc,
zero_based,
max_split,
block_char,
):
"""
Bin a pairix-indexed contact list file.
Expand Down Expand Up @@ -326,6 +342,7 @@ def pairix(
map=map_func,
is_one_based=(not zero_based),
n_chunks=max_split,
block_char=block_char,
)

create_cooler(
Expand Down
34 changes: 24 additions & 10 deletions src/cooler/create/_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from bisect import bisect_left
from collections import Counter, OrderedDict
from functools import partial
from typing import Any, Callable
from typing import Any, Callable, Iterator

import h5py
import numpy as np
Expand Down Expand Up @@ -514,7 +514,7 @@ def __getstate__(self) -> dict:
d.pop("_map", None)
return d

def __iter__(self) -> dict[str, np.ndarray]:
def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
"""Iterator over chunks of binned contacts (i.e., nonzero pixels)

Chunks are expected to have the following format:
Expand Down Expand Up @@ -633,7 +633,7 @@ def aggregate(self, chrom: str) -> pd.DataFrame: # pragma: no cover
def size(self) -> int:
return len(self.h5["chrms1"])

def __iter__(self) -> dict[str, np.ndarray]:
def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
for chrom in self.gs.contigs:
for df in self.aggregate(chrom):
yield {k: v.values for k, v in df.items()}
Expand Down Expand Up @@ -700,7 +700,9 @@ def __init__(
"ordering of read ends in the contact list file."
)

def aggregate(self, grange): # pragma: no cover
def aggregate(
self, grange: tuple[str, int, int]
) -> pd.DataFrame | None: # pragma: no cover
chrom1, start, end = grange
import pysam

Expand Down Expand Up @@ -771,7 +773,7 @@ def aggregate(self, grange): # pragma: no cover

return pd.concat(rows, axis=0) if len(rows) else None

def __iter__(self):
def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
granges = balanced_partition(self.gs, self.n_chunks, self.file_contigs)
for df in self._map(self.aggregate, granges):
if df is not None:
Expand All @@ -793,6 +795,7 @@ def __init__(
map: MapFunctor = map,
n_chunks: int = 1,
is_one_based: bool = False,
block_char: str = "|",
**kwargs,
):
try:
Expand All @@ -811,13 +814,24 @@ def __init__(
self._map = map
self.n_chunks = n_chunks
self.is_one_based = bool(is_one_based)
self.block_char = block_char

f = pypairix.open(filepath, "r")
self.C1 = f.get_chr1_col()
self.C2 = f.get_chr2_col()
self.P1 = f.get_startpos1_col()
self.P2 = f.get_startpos2_col()
blocknames = f.get_blocknames()
if block_char not in blocknames[0]:
raise ValueError(
f"The contig separator character `{block_char}` does not "
f"appear in the first block name `{blocknames[0]}`. Please "
"specify the correct character as `block_char`."
)
self.file_contigs = set(
itertools.chain.from_iterable([b.split("|") for b in f.get_blocknames()])
itertools.chain.from_iterable([
blockname.split(block_char) for blockname in blocknames
])
)

if not len(self.file_contigs):
Expand All @@ -826,7 +840,7 @@ def __init__(
if f.exists2(c1, c2) and f.exists2(c2, c1):
raise RuntimeError(
"Pairs are not triangular: found blocks "
+ f"'{c1}|{c2}'' and '{c2}|{c1}'"
+ f"'{c1}{block_char}{c2}'' and '{c2}{block_char}{c1}'"
)

# dumb heuristic to prevent excessively large chunks on one worker
Expand Down Expand Up @@ -942,7 +956,7 @@ def aggregate(

return pd.concat(rows, axis=0) if len(rows) else None

def __iter__(self) -> dict[str, np.ndarray]:
def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
granges = balanced_partition(self.gs, self.n_chunks, self.file_contigs)
for df in self._map(self.aggregate, granges):
if df is not None:
Expand Down Expand Up @@ -981,7 +995,7 @@ def select_block(self, chrom1, chrom2):
raise
return block

def __iter__(self) -> dict[str, np.ndarray]:
def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
# n_bins = len(self.bins)

import scipy.sparse
Expand Down Expand Up @@ -1036,7 +1050,7 @@ def __init__(
self.array = array
self.chunksize = chunksize

def __iter__(self) -> dict[str, np.ndarray]:
def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
n_bins = self.array.shape[0]
spans = partition(0, n_bins, self.chunksize)

Expand Down
19 changes: 18 additions & 1 deletion tests/test_create_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from cooler.cli.load import load

pysam_missing = importlib.util.find_spec("pysam") is None
pairix_missing = importlib.util.find_spec("pairix") is None
pairix_missing = importlib.util.find_spec("pypairix") is None
_pandas_major_version = int(pd.__version__.split(".")[0])

tmp = tempfile.gettempdir()
Expand Down Expand Up @@ -246,6 +246,7 @@ def test_cload_pairix(bins_path, pairs_path, ref_path, nproc):
nproc=nproc,
zero_based=False,
max_split=2,
block_char="|",
)
with h5py.File(testcool_path, "r") as f1, h5py.File(ref_path, "r") as f2:
assert np.all(f1["pixels/bin1_id"][:] == f2["pixels/bin1_id"][:])
Expand All @@ -257,6 +258,22 @@ def test_cload_pairix(bins_path, pairs_path, ref_path, nproc):
pass


@pytest.mark.skipif(pairix_missing, reason="pairix not installed")
def test_cload_pairix_wrong_block_char():
    """A separator absent from the index's block names should be rejected."""
    bins_path = op.join(testdir, "data", "hg19.bins.2000kb.bed.gz")
    pairs_path = op.join(
        testdir, "data", "hg19.GM12878-MboI.pairs.subsample.blksrt.txt.gz"
    )
    # The test fixture's pairix index uses "|" as the contig separator, so
    # passing "?" must trigger the ValueError raised during ingestion setup.
    with pytest.raises(ValueError):
        cload_pairix.callback(
            bins_path,
            pairs_path,
            testcool_path,
            metadata=None,
            assembly="hg19",
            nproc=1,
            zero_based=False,
            max_split=2,
            block_char="?",
        )


@pytest.mark.skipif(
_pandas_major_version < 1, reason="hash fix only works with pandas >= 1.0"
)
Expand Down