Merge pull request #195 from golobor/master

make pairsio.py to read (and, in the future, write) .pairs files
open2c · Mar 9, 2024 · 7cd8c9a · 7cd8c9a
2 parents f6574dd + aba0a53
commit 7cd8c9a
Show file tree

Hide file tree

Showing 8 changed files with 262 additions and 123 deletions.
diff --git a/doc/examples/scalings_example.ipynb b/doc/examples/scalings_example.ipynb
diff --git a/pairtools/cli/scaling.py b/pairtools/cli/scaling.py
@@ -5,7 +5,7 @@
 import click
 import pandas as pd
 
-from ..lib import fileio, pairsam_format, headerops
+from ..lib import fileio
 from . import cli, common_io_options
 
 from ..lib.scaling import compute_scaling
@@ -39,21 +39,21 @@
 @click.option(
     "--dist-range",
     type=click.Tuple([int, int]),
-    default=(10, 1_000_000_000),
+    default=(1, 1_000_000_000),
     show_default=True,
     required=False,
     help="Distance range. ",
 )
 @click.option(
-    "--n-dist-bins",
+    "--n-dist-bins-decade",
     type=int,
-    default=128,
+    default=8,
     show_default=True,
     required=False,
-    help="Number of distance bins to split the distance range. ",
+    help="Number of bins to split the distance range in log10-space, specified per a factor of 10 difference.",
 )
 @common_io_options
-def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs):
+def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs):
     """Calculate pairs scalings.
 
     INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics.
@@ -63,10 +63,10 @@ def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwar
 
     Output is .tsv file with scaling stats (both cis scalings and trans levels).
     """
-    scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs)
+    scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs)
 
 
-def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs):
+def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs):
 
     if len(input_path) == 0:
         raise ValueError(f"No input paths: {input_path}")
@@ -93,13 +93,13 @@ def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **k
         regions=view,
         chromsizes=None,
         dist_range=dist_range,
-        n_dist_bins=n_dist_bins,
+        n_dist_bins_decade=n_dist_bins_decade,
         chunksize=chunksize,
     )
     summary_stats = pd.concat([cis_scalings, trans_levels])
 
     # save statistics to the file
-    summary_stats.to_csv(outstream, sep="\t")
+    summary_stats.to_csv(outstream, sep="\t", index=False)
 
     if instream != sys.stdin:
         instream.close()

diff --git a/pairtools/lib/__init__.py b/pairtools/lib/__init__.py
@@ -2,6 +2,7 @@
 from . import dedup
 from . import filterbycov
 from . import headerops
+from . import pairsio
 from . import pairsam_format
 from . import parse
 from . import parse_pysam

diff --git a/pairtools/lib/fileio.py b/pairtools/lib/fileio.py
@@ -3,7 +3,6 @@
 import subprocess
 import sys
 
-
 class ParseError(Exception):
     pass
 
@@ -235,3 +234,32 @@ def close(self, timeout=None):
         self._stream.close()
         retcode = self._proc.wait(timeout=timeout)
         return retcode
+
+
+def get_stream_handlers(instream):
+    """
+    Get the readline and peek functions for the provided input stream.
+
+    Parameters:
+        instream (file-like object): The input stream to get the handlers for.
+
+    Returns:
+        tuple: A tuple containing the following elements:
+            - readline_f (function): The readline function for the input stream.
+            - peek_f (function): The peek function for the input stream.
+
+    Raises:
+        ValueError: If the peek function cannot be found for the provided stream.
+    """
+    readline_f, peek_f = None, None
+    if hasattr(instream, "buffer"):
+        peek_f = instream.buffer.peek
+        readline_f = instream.buffer.readline
+    elif hasattr(instream, "peek"):
+        peek_f = instream.peek
+        readline_f = instream.readline
+    else:
+        raise ValueError("Cannot find the peek() function of the provided stream!")
+    return readline_f, peek_f
+
+
diff --git a/pairtools/lib/headerops.py b/pairtools/lib/headerops.py
@@ -9,7 +9,7 @@
 
 from .. import __version__
 from . import pairsam_format
-from .fileio import ParseError
+from .fileio import ParseError, get_stream_handlers
 
 from .._logging import get_logger
 
@@ -21,19 +21,6 @@
 COMMENT_CHAR = "#"
 
 
-def get_stream_handlers(instream):
-    # get peekable buffer for the instream
-    readline_f, peek_f = None, None
-    if hasattr(instream, "buffer"):
-        peek_f = instream.buffer.peek
-        readline_f = instream.buffer.readline
-    elif hasattr(instream, "peek"):
-        peek_f = instream.peek
-        readline_f = instream.readline
-    else:
-        raise ValueError("Cannot find the peek() function of the provided stream!")
-    return readline_f, peek_f
-
 
 def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False):
     """Returns a header from the stream and an the reaminder of the stream

diff --git a/pairtools/lib/pairsio.py b/pairtools/lib/pairsio.py
@@ -0,0 +1,49 @@
+import pandas as pd
+
+from . import fileio, headerops
+
+def read_pairs(pairs, nproc=3, cmd_in=None, **kwargs):
+    """
+    Reads a file with .pairs format and returns a header, a dataframe of pairs, and chromsizes.
+
+    Parameters:
+        pairs (str or file-like object): A path to a .pairs file to read or an open file-like object/handle.
+        nproc (int): Number of processes to use for reading the file. Default is 3.
+        cmd_in (str): The command to be used for reading the file. Default is None.
+
+        **kwargs: Additional keyword arguments to be passed to pd.read_csv. Useful options include:
+            - chunksize (int): If specified, return an iterable object of type TextFileReader that reads in chunks of lines.
+            - usecols (list-like or callable): Return a subset of the columns. If list-like, all elements must either be positional or strings. If callable, the callable function will be evaluated against the column names, returning names where the callable function evaluates to True.
+
+    Returns:
+        tuple: A tuple containing the following elements:
+            - pairs_df (pd.DataFrame): A pandas DataFrame with pairs.
+            - header (list of str): The original header of the pairs file.
+            - chromsizes (dict): A dictionary containing chromosome sizes extracted from the header.
+    """
+    pairs_stream = (
+        fileio.auto_open(
+            pairs,
+            mode="r",
+            nproc=nproc,
+            command=cmd_in,
+        )
+        if isinstance(pairs, str)
+        else pairs
+    )
+
+    header, pairs_body = headerops.get_header(pairs_stream)
+    cols = headerops.extract_column_names(header)
+
+    chromsizes = headerops.extract_chromsizes(header)
+
+    pairs_df = pd.read_csv(
+        pairs_body,
+        header=None,
+        names=cols,
+        sep="\t",
+        dtype={"chrom1": str, "chrom2": str},
+        **kwargs
+    )
+
+    return pairs_df, header, chromsizes