Skip to content

Commit

Permalink
Merge pull request #195 from golobor/master
Browse files Browse the repository at this point in the history
make pairsio.py to read (and, in the future, write) .pairs files
  • Loading branch information
golobor authored Mar 9, 2024
2 parents f6574dd + aba0a53 commit 7cd8c9a
Show file tree
Hide file tree
Showing 8 changed files with 262 additions and 123 deletions.
126 changes: 85 additions & 41 deletions doc/examples/scalings_example.ipynb

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions pairtools/cli/scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import click
import pandas as pd

from ..lib import fileio, pairsam_format, headerops
from ..lib import fileio
from . import cli, common_io_options

from ..lib.scaling import compute_scaling
Expand Down Expand Up @@ -39,21 +39,21 @@
@click.option(
"--dist-range",
type=click.Tuple([int, int]),
default=(10, 1_000_000_000),
default=(1, 1_000_000_000),
show_default=True,
required=False,
help="Distance range. ",
)
@click.option(
"--n-dist-bins",
"--n-dist-bins-decade",
type=int,
default=128,
default=8,
show_default=True,
required=False,
help="Number of distance bins to split the distance range. ",
help="Number of bins to split the distance range in log10-space, specified per a factor of 10 difference.",
)
@common_io_options
def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs):
def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs):
"""Calculate pairs scalings.
INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics.
Expand All @@ -63,10 +63,10 @@ def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwar
Output is .tsv file with scaling stats (both cis scalings and trans levels).
"""
scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs)
scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs)


def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs):
def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs):

if len(input_path) == 0:
raise ValueError(f"No input paths: {input_path}")
Expand All @@ -93,13 +93,13 @@ def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **k
regions=view,
chromsizes=None,
dist_range=dist_range,
n_dist_bins=n_dist_bins,
n_dist_bins_decade=n_dist_bins_decade,
chunksize=chunksize,
)
summary_stats = pd.concat([cis_scalings, trans_levels])

# save statistics to the file
summary_stats.to_csv(outstream, sep="\t")
summary_stats.to_csv(outstream, sep="\t", index=False)

if instream != sys.stdin:
instream.close()
Expand Down
1 change: 1 addition & 0 deletions pairtools/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from . import dedup
from . import filterbycov
from . import headerops
from . import pairsio
from . import pairsam_format
from . import parse
from . import parse_pysam
Expand Down
30 changes: 29 additions & 1 deletion pairtools/lib/fileio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import subprocess
import sys


class ParseError(Exception):
pass

Expand Down Expand Up @@ -235,3 +234,32 @@ def close(self, timeout=None):
self._stream.close()
retcode = self._proc.wait(timeout=timeout)
return retcode


def get_stream_handlers(instream):
    """
    Get the readline and peek functions for the provided input stream.

    If the stream exposes a peekable ``.buffer`` attribute, the handlers of
    that buffer are returned. Otherwise, if the stream itself has ``peek``,
    the stream's own handlers are returned.

    Parameters:
        instream (file-like object): The input stream to get the handlers for.

    Returns:
        tuple: A tuple containing the following elements:
            - readline_f (function): The readline function for the input stream.
            - peek_f (function): The peek function for the input stream.

    Raises:
        ValueError: If the peek function cannot be found for the provided stream.
    """
    # A stream may carry a `.buffer` that is missing, None, or not peekable;
    # check for `peek` explicitly so that such streams end in the documented
    # ValueError instead of an AttributeError.
    buffer = getattr(instream, "buffer", None)
    if buffer is not None and hasattr(buffer, "peek"):
        return buffer.readline, buffer.peek

    if hasattr(instream, "peek"):
        return instream.readline, instream.peek

    raise ValueError("Cannot find the peek() function of the provided stream!")


15 changes: 1 addition & 14 deletions pairtools/lib/headerops.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from .. import __version__
from . import pairsam_format
from .fileio import ParseError
from .fileio import ParseError, get_stream_handlers

from .._logging import get_logger

Expand All @@ -21,19 +21,6 @@
COMMENT_CHAR = "#"


def get_stream_handlers(instream):
# get peekable buffer for the instream
readline_f, peek_f = None, None
if hasattr(instream, "buffer"):
peek_f = instream.buffer.peek
readline_f = instream.buffer.readline
elif hasattr(instream, "peek"):
peek_f = instream.peek
readline_f = instream.readline
else:
raise ValueError("Cannot find the peek() function of the provided stream!")
return readline_f, peek_f


def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False):
"""Returns a header from the stream and the remainder of the stream
Expand Down
49 changes: 49 additions & 0 deletions pairtools/lib/pairsio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pandas as pd

from . import fileio, headerops

def read_pairs(pairs, nproc=3, cmd_in=None, **kwargs):
    """
    Reads a file with .pairs format and returns a dataframe of pairs, a header, and chromsizes.

    Parameters:
        pairs (str, os.PathLike or file-like object): A path to a .pairs file
            to read or an open file-like object/handle. Paths are opened with
            fileio.auto_open; anything else is used as a stream as-is.
        nproc (int): Passed to fileio.auto_open as `nproc` when `pairs` is a
            path. Default is 3.
        cmd_in (str): Passed to fileio.auto_open as `command` when `pairs` is
            a path. Default is None.
        **kwargs: Additional keyword arguments to be passed to pd.read_csv. Useful options include:
            - chunksize (int): If specified, return an iterable object of type TextFileReader that reads in chunks of lines.
            - usecols (list-like or callable): Return a subset of the columns. If list-like, all elements must either be positional or strings. If callable, the callable function will be evaluated against the column names, returning names where the callable function evaluates to True.

    Returns:
        tuple: A tuple containing the following elements:
            - pairs_df (pd.DataFrame): A pandas DataFrame with pairs (or the
              lazy reader returned by pd.read_csv when e.g. `chunksize` is given).
            - header (list of str): The original header of the pairs file.
            - chromsizes: Chromosome sizes extracted from the header by
              headerops.extract_chromsizes.

    Notes:
        A stream opened here from a path is closed once the table has been
        read eagerly, or if reading fails. It is left open when pd.read_csv
        returns a lazy reader, because that reader still needs the stream.
        A stream supplied by the caller is never closed.
    """
    import os  # function-scope import: keeps this block self-contained

    opened_here = isinstance(pairs, (str, os.PathLike))
    if opened_here:
        pairs_stream = fileio.auto_open(
            os.fspath(pairs),
            mode="r",
            nproc=nproc,
            command=cmd_in,
        )
    else:
        pairs_stream = pairs

    try:
        header, pairs_body = headerops.get_header(pairs_stream)
        cols = headerops.extract_column_names(header)
        chromsizes = headerops.extract_chromsizes(header)

        pairs_df = pd.read_csv(
            pairs_body,
            header=None,
            names=cols,
            sep="\t",
            dtype={"chrom1": str, "chrom2": str},
            **kwargs
        )
    except Exception:
        # Do not leak the handle we opened ourselves if parsing fails.
        if opened_here:
            pairs_stream.close()
        raise

    # Only an eagerly-read DataFrame is independent of the stream; a chunked
    # reader still pulls from it, so the stream must stay open in that case.
    if opened_here and isinstance(pairs_df, pd.DataFrame):
        pairs_stream.close()

    return pairs_df, header, chromsizes
Loading

0 comments on commit 7cd8c9a

Please sign in to comment.