diff --git a/.travis.yml b/.travis.yml index d5fde17..1d4dfbe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,8 +30,8 @@ before_install: - export HIC_TEST_DATA_DIR="`pwd`/hicmatrix/test/test_data/" - echo $HIC_TEST_DATA_DIR -- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh ; fi -- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o miniconda.sh ; fi +- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then curl -L https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh ; fi +- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then curl -L https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o miniconda.sh ; fi - bash miniconda.sh -b -p $HOME/miniconda - PATH_WITHOUT_CONDA="$PATH" - export PATH="$HOME/miniconda/bin:$PATH" diff --git a/hicmatrix/HiCMatrix.py b/hicmatrix/HiCMatrix.py index 39acf73..1edb9ac 100644 --- a/hicmatrix/HiCMatrix.py +++ b/hicmatrix/HiCMatrix.py @@ -27,7 +27,7 @@ class hiCMatrix: get sub matrices by chrname. """ - def __init__(self, pMatrixFile=None, pChrnameList=None): + def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None, pNoIntervalTree=None, pUpperTriangleOnly=None): self.non_homogeneous_warning_already_printed = False self.bin_size = None self.bin_size_homogeneous = None # track if the bins are equally spaced or not @@ -49,40 +49,30 @@ def __init__(self, pMatrixFile=None, pChrnameList=None): fileType = 'cool' if pMatrixFile.endswith('.h5'): fileType = 'h5' - self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList) + self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList, pDistance=pDistance) log.debug('init time: {}'.format(time.time() - start_time)) - self.matrix, self.cut_intervals, self.nan_bins, \ - self.correction_factors, self.distance_counts = self.matrixFileHandler.load() - # if len(self.matrix.data) == 0: - # log.warning('No data for {}, not initialization of object. '.format(pChrnameList)) - # self.interval_trees = None - # self.chrBinBoundaries = None - # return - log.debug('load time: {}'.format(time.time() - start_time)) - start_time = time.time() - - log.debug('data loaded from file handler') + matrixFileHandler_load = self.matrixFileHandler.load() + # check if there was any exception thrown in the load function + if len(matrixFileHandler_load) == 2: + raise Exception('Matrix failed to load: {}'.format(str(matrixFileHandler_load[1]))) + else: + self.matrix, self.cut_intervals, self.nan_bins, \ + self.correction_factors, self.distance_counts = matrixFileHandler_load if self.nan_bins is None: self.nan_bins = np.array([]) - self.fillLowerTriangle() - log.debug('triangle time: {}'.format(time.time() - start_time)) + if pUpperTriangleOnly is None or not pUpperTriangleOnly: + self.fillLowerTriangle() start_time = time.time() - log.debug('fillLowerTriangle') - self.restoreMaskedBins() - log.debug('restoreMaskedBins: {}'.format(time.time() - start_time)) start_time = time.time() - log.debug('restoreMaskedBins') - - self.interval_trees, self.chrBinBoundaries = \ - self.intervalListToIntervalTree(self.cut_intervals) - log.debug('intervalListToIntervalTree: {}'.format(time.time() - start_time)) - start_time = time.time() - - log.debug('intervalListToIntervalTree') + if pNoIntervalTree is None or not pNoIntervalTree: + self.interval_trees, self.chrBinBoundaries = \ + self.intervalListToIntervalTree(self.cut_intervals) + else: + log.debug('no intervaltree') elif pMatrixFile is None: log.debug('Only init object, no matrix given.') @@ -1016,6 +1006,6 @@ def intervalListToIntervalTree(self, interval_list): def check_cooler(pFileName): - if pFileName.endswith('.cool') or cooler.io.is_cooler(pFileName) or'.mcool' in pFileName: + if pFileName.endswith('.cool') or cooler.io.is_cooler(pFileName) or '.mcool' in pFileName: return True return False diff --git a/hicmatrix/__init__.py b/hicmatrix/__init__.py index 7d33780..7ee7855 100644 --- a/hicmatrix/__init__.py +++ b/hicmatrix/__init__.py @@ -1,3 +1,5 @@ import logging logging.basicConfig(level=logging.INFO) +# logging.basicConfig(level=logging.DEBUG) + logging.getLogger('cooler').setLevel(logging.WARNING) diff --git a/hicmatrix/_version.py b/hicmatrix/_version.py index e2fe542..0626305 100644 --- a/hicmatrix/_version.py +++ b/hicmatrix/_version.py @@ -1,2 +1,2 @@ -__version__ = '12' +__version__ = '13' # Version number differs from HiCExplorer! diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index fd6182b..af18f86 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -6,15 +6,16 @@ import math import time +import gc import cooler import h5py import numpy as np -from scipy.sparse import triu, csr_matrix +from scipy.sparse import triu, csr_matrix, lil_matrix import pandas as pd from hicmatrix.utilities import toString, toBytes -from hicmatrix.utilities import convertNansToOnes +from hicmatrix.utilities import convertNansToOnes, convertNansToZeros from hicmatrix._version import __version__ from .matrixFile import MatrixFile @@ -31,14 +32,12 @@ def __init__(self, pMatrixFile=None): self.appendData = False self.fileWasH5 = False self.applyCorrectionLoad = True - # self.hic_info = {} self.hic_metadata = {} self.cool_info = None self.hic2cool_version = None self.hicmatrix_version = None - self.scaleToOriginalRange = None - # self.correction_factors = None + self.distance = None def getInformationCoolerBinNames(self): return cooler.Cooler(self.matrixFileName).bins().columns.values @@ -48,8 +47,7 @@ def load(self): self.minValue = None self.maxValue = None if self.matrixFileName is None: - log.info('No matrix is initialized') - + log.warning('No matrix is initialized') try: cooler_file = cooler.Cooler(self.matrixFileName) if 'metadata' in cooler_file.info: @@ -57,16 +55,12 @@ def load(self): else: self.hic_metadata = None self.cool_info = deepcopy(cooler_file.info) - # log.debug("cooler_file.info {}".format(cooler_file.info)) except Exception as e: - log.info("Could not open cooler file. Maybe the path is wrong or the given node is not available.") - log.info('The following file was tried to open: {}'.format(self.matrixFileName)) - log.info("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0]))) - e - log.debug('self.chrnameList {}'.format(self.chrnameList)) + log.warning("Could not open cooler file. Maybe the path is wrong or the given node is not available.") + log.warning('The following file was tried to open: {}'.format(self.matrixFileName)) + log.warning("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0]))) + return None, e if self.chrnameList is None: - log.debug('muh 69') - matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True) used_dtype = np.int32 if np.iinfo(np.int32).max < cooler_file.info['nbins']: @@ -94,29 +88,44 @@ def load(self): del _data del _instances del _features - matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype) - self.minValue = data.min() - self.maxValue = data.max() del data del instances del features + gc.collect() + else: if len(self.chrnameList) == 1: try: - log.debug('Load data') - matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr() - # handle the case of an empty csr matrix - if len(matrix.data) == 0: - self.minValue = 0 - self.maxValue = 0 + if self.distance is None or cooler_file.binsize is None: + # load the full chromosome + matrix = cooler_file.matrix(balance=False, sparse=True, as_pixels=False).fetch(self.chrnameList[0]).tocsr() else: - self.minValue = matrix.data.min() - self.maxValue = matrix.data.max() + # load only the values up to a specific distance + lo, hi = cooler_file.extent(self.chrnameList[0]) + dist = self.distance // cooler_file.binsize + step = (hi - lo) // 32 + if step < 1: + step = 1 + mat = lil_matrix((hi - lo, hi - lo), dtype=np.float32) + + for i0, i1 in cooler.util.partition(lo, hi, step): + # fetch stripe + pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi] + # filter + pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']) < dist] + # insert into sparse matrix + mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'].astype(np.float32) + del pixels + + matrix = mat.tocsr() + del mat + gc.collect() + except ValueError as ve: log.exception("Wrong chromosome format. Please check UCSC / ensembl notation.") - ve + log.exception('Error: {}'.format(str(ve))) else: raise Exception("Operation to load more as one region is not supported.") @@ -126,7 +135,7 @@ def load(self): if self.chrnameList is not None: if len(self.chrnameList) == 1: cut_intervals_data_frame = cooler_file.bins().fetch(self.chrnameList[0]) - + log.debug('cut_intervals_data_frame {}'.format(list(cut_intervals_data_frame.columns))) if self.correctionFactorTable in cut_intervals_data_frame: correction_factors_data_frame = cut_intervals_data_frame[self.correctionFactorTable] else: @@ -146,9 +155,10 @@ def load(self): matrix.data = matrix.data.astype(float) - correction_factors = convertNansToOnes(np.array(correction_factors_data_frame.values).flatten()) - # apply only if there are not only 1's - if np.sum(correction_factors) != len(correction_factors): + correction_factors = np.array(correction_factors_data_frame.values).flatten() + # Don't apply correction if weight were just 'nans' + if np.sum(np.isnan(correction_factors)) != len(correction_factors): + # correction_factors = convertNansToZeros(correction_factors) matrix.sort_indices() instances, features = matrix.nonzero() @@ -156,34 +166,17 @@ def load(self): features_factors = correction_factors[features] if self.correctionOperator is None: + if self.correctionFactorTable in ['KR', 'VC', 'SQRT_VC']: + self.correctionOperator = '/' + else: + self.correctionOperator = '*' if 'generated-by' in cooler_file.info: log.debug('cooler_file.info[\'generated-by\'] {} {}'.format(cooler_file.info['generated-by'], type(cooler_file.info['generated-by']))) generated_by = toString(cooler_file.info['generated-by']) if 'hic2cool' in generated_by: - self.hic2cool_version = generated_by.split('-')[1] - if self.hic2cool_version >= '0.5': - log.debug('0.5') - self.correctionOperator = '/' - else: - log.debug('0.4') - - self.correctionOperator = '*' - else: - self.correctionOperator = '*' - - log.debug('hic2cool: {}'.format(self.hic2cool_version)) - log.debug('self.correctionOperator : {}'.format(self.correctionOperator)) - - # elif 'hicmatrix' in generated_by: - - # self.hicmatrix_version = generated_by.split('-')[1] - # if self.hicmatrix_version >= '8': - # self.correctionOperator = '/' - # else: - # self.correctionOperator = '*' - else: - self.correctionOperator = '*' + elif 'hicmatrix' in generated_by: + self.hicmatrix_version = generated_by.split('-')[1] instances_factors *= features_factors log.debug('hic2cool: {}'.format(self.hic2cool_version)) @@ -193,41 +186,27 @@ def load(self): elif self.correctionOperator == '/': matrix.data /= instances_factors - # if self.scaleToOriginalRange is not None: - min_value = matrix.data.min() - max_value = matrix.data.max() - # check if max smaller one or if not same mangnitude - if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1): - desired_range_difference = self.maxValue - self.minValue - - min_value = matrix.data.min() - max_value = matrix.data.max() - - matrix.data = (matrix.data - min_value) - matrix.data /= (max_value - min_value) - matrix.data *= desired_range_difference - matrix.data += self.minValue - self.scaleToOriginalRange = True - # diff_scale_factor = matrix.data.max() / max_value - # if self.correctionOperator == '*': - # correction_factors *= diff_scale_factor - # if self.correctionOperator == '/': - # correction_factors /= diff_scale_factor - cut_intervals = [] - time_start = time.time() - log.debug('Creating cut_intervals {}'.format(time_start)) for values in cut_intervals_data_frame.values: cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0])) - log.debug('Creating cut_intervals {} DONE'.format(time.time() - time_start)) del cut_intervals_data_frame del correction_factors_data_frame # try to restore nan_bins. try: + # remove possible nan bins introduced by the correction factors + # to have them part of the nan_bins vector + mask = np.isnan(matrix.data) + matrix.data[mask] = 0 + matrix.eliminate_zeros() shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1] - nan_bins = np.arange(shape) - nan_bins = np.setdiff1d(nan_bins, matrix.indices) - + nan_bins_indices = np.arange(shape) + nan_bins_indices = np.setdiff1d(nan_bins_indices, matrix.indices) + + nan_bins = [] + for bin_id in nan_bins_indices: + if len(matrix[bin_id, :].data) == 0: + nan_bins.append(bin_id) + nan_bins = np.array(nan_bins) except Exception: nan_bins = None @@ -276,9 +255,12 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': np.int32} if self.correction_factors is not None and pApplyCorrection: dtype_pixel['weight'] = np.float32 - if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5: - log.debug('wash5 true') + # if the correction was applied by a division, invert it because cool format expects multiplicative if table name is 'weight' + # https://cooler.readthedocs.io/en/latest/api.html#cooler.Cooler.matrix + if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5 or self.correctionOperator == '/': + + log.debug('h5 true') self.correction_factors = np.array(self.correction_factors).flatten() self.correction_factors = 1 / self.correction_factors mask = np.isnan(self.correction_factors) @@ -290,7 +272,7 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): weight = convertNansToOnes(np.array(self.correction_factors).flatten()) bins_data_frame = bins_data_frame.assign(weight=weight) - log.info("Reverting correction factors on matrix...") + log.debug("Reverting correction factors on matrix...") instances, features = self.matrix.nonzero() self.correction_factors = np.array(self.correction_factors) @@ -303,32 +285,14 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): self.matrix.data = self.matrix.data.astype(float) # Apply the invert operation to get the original data - log.debug('self.correctionOperator: {}'.format(self.correctionOperator)) - log.debug('self.fileWasH5: {}'.format(self.fileWasH5)) - - if self.scaleToOriginalRange: - min_value = self.matrix.data.min() - max_value = self.matrix.data.max() - desired_range_difference = max_value - min_value - - self.matrix.data = (self.matrix.data - self.minValue) - self.matrix.data /= (self.maxValue - self.minValue) - self.matrix.data *= desired_range_difference - self.matrix.data += min_value - if self.correctionOperator == '*' or self.correctionOperator is None: self.matrix.data /= instances_factors - elif self.correctionOperator == '/' or self.fileWasH5: - self.matrix.data *= instances_factors instances_factors = None features_factors = None self.matrix.eliminate_zeros() - log.debug('self.correction_factors {}'.format(self.correction_factors)) - log.debug('pApplyCorrection {}'.format(pApplyCorrection)) - if self.correction_factors is not None and pApplyCorrection is False: dtype_pixel['weight'] = np.float32 weight = convertNansToOnes(np.array(self.correction_factors).flatten()) @@ -349,7 +313,7 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): matrix_data_frame = matrix_data_frame.assign(count=self.matrix.data) if not self.enforceInteger and self.matrix.dtype not in [np.int32, int]: - log.warning("Writing non-standard cooler matrix. Datatype of matrix['count'] is: {}".format(self.matrix.dtype)) + log.debug("Writing non-standard cooler matrix. Datatype of matrix['count'] is: {}".format(self.matrix.dtype)) dtype_pixel['count'] = self.matrix.dtype split_factor = 1 if len(self.matrix.data) > 1e7: @@ -372,12 +336,12 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): if 'bin-type' in info: del info['bin-type'] - info['format'] = np.string_('HDF5::Cooler') - info['format-url'] = np.string_('https://github.com/mirnylab/cooler') - info['generated-by'] = np.string_('HiCMatrix-' + __version__) - info['generated-by-cooler-lib'] = np.string_('cooler-' + cooler.__version__) + info['format'] = str('HDF5::Cooler') + info['format-url'] = str('https://github.com/mirnylab/cooler') + info['generated-by'] = str('HiCMatrix-' + __version__) + info['generated-by-cooler-lib'] = str('cooler-' + cooler.__version__) - info['tool-url'] = np.string_('https://github.com/deeptools/HiCMatrix') + info['tool-url'] = str('https://github.com/deeptools/HiCMatrix') # info['nchroms'] = int(bins_data_frame['chrom'][:].nunique()) # info['chromosomes'] = list(bins_data_frame['chrom'][:].unique()) @@ -387,13 +351,13 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): # info['sum-elements'] = int(matrix_data_frame['count'].sum()) if self.hic_metadata is not None and 'matrix-generated-by' in self.hic_metadata: - info['matrix-generated-by'] = np.string_(self.hic_metadata['matrix-generated-by']) + info['matrix-generated-by'] = str(self.hic_metadata['matrix-generated-by']) del self.hic_metadata['matrix-generated-by'] if self.hic_metadata is not None and 'matrix-generated-by-url' in self.hic_metadata: - info['matrix-generated-by-url'] = np.string_(self.hic_metadata['matrix-generated-by-url']) + info['matrix-generated-by-url'] = str(self.hic_metadata['matrix-generated-by-url']) del self.hic_metadata['matrix-generated-by-url'] if self.hic_metadata is not None and 'genome-assembly' in self.hic_metadata: - info['genome-assembly'] = np.string_(self.hic_metadata['genome-assembly']) + info['genome-assembly'] = str(self.hic_metadata['genome-assembly']) del self.hic_metadata['genome-assembly'] local_temp_dir = os.path.dirname(os.path.realpath(pFileName)) diff --git a/hicmatrix/lib/homer.py b/hicmatrix/lib/homer.py index 9b60a1e..aad6232 100644 --- a/hicmatrix/lib/homer.py +++ b/hicmatrix/lib/homer.py @@ -5,6 +5,7 @@ from scipy.sparse import csr_matrix from .matrixFile import MatrixFile +from hicmatrix.utilities import opener class Homer(MatrixFile, object): @@ -15,26 +16,27 @@ def __init__(self, pMatrixFile): def load(self): cut_intervals = [] - with open(self.matrixFileName, 'r') as matrix_file: + # matrix_file = opener(self.matrixFileName) + with opener(self.matrixFileName) as matrix_file: values = matrix_file.readline() - values = values.strip().split('\t') + values = values.strip().split(b'\t') # get bin size - start_first = int(values[2].strip().split('-')[1]) - start_second = int(values[3].strip().split('-')[1]) + start_first = int(values[2].strip().split(b'-')[1]) + start_second = int(values[3].strip().split(b'-')[1]) bin_size = start_second - start_first for i, value in enumerate(values[2:]): - chrom, start = value.strip().split('-') - cut_intervals.append((chrom, int(start), int(start) + bin_size, 1)) + chrom, start = value.strip().split(b'-') + cut_intervals.append((chrom.decode('ascii'), int(start), int(start) + bin_size, 1)) matrix_dense = [] for line in matrix_file: - values = line.split('\t') + values = line.split(b'\t') data = [] for i, value in enumerate(values[2:]): data.append(float(value)) matrix_dense.append(data) - + # matrix_file.close() matrix = csr_matrix(matrix_dense) nan_bins = None distance_counts = None diff --git a/hicmatrix/lib/matrixFileHandler.py b/hicmatrix/lib/matrixFileHandler.py index 233d51e..dd8a5fc 100644 --- a/hicmatrix/lib/matrixFileHandler.py +++ b/hicmatrix/lib/matrixFileHandler.py @@ -10,7 +10,8 @@ class MatrixFileHandler(): def __init__(self, pFileType='cool', pMatrixFile=None, pChrnameList=None, pApplyCorrectionCoolerLoad=None, pBedFileHicPro=None, pCorrectionFactorTable=None, - pCorrectionOperator=None, pEnforceInteger=None, pAppend=None, pFileWasH5=None, pHiCInfo=None, pHic2CoolVersion=None): + pCorrectionOperator=None, pEnforceInteger=None, pAppend=None, pFileWasH5=None, pHiCInfo=None, pHic2CoolVersion=None, + pDistance=None): self.class_ = getattr(importlib.import_module('.' + pFileType.lower(), package='hicmatrix.lib'), pFileType.title()) @@ -39,6 +40,9 @@ def __init__(self, pFileType='cool', pMatrixFile=None, pChrnameList=None, log.debug('pHic2CoolVersion : {}'.format(pHic2CoolVersion)) if pHic2CoolVersion is not None: self.matrixFile.hic2cool_version = pHic2CoolVersion + if pDistance is not None: + self.matrixFile.distance = pDistance + log.debug('self.distance {}'.format(self.matrixFile.distance)) def load(self): diff --git a/hicmatrix/test/test_HiCMatrix.py b/hicmatrix/test/test_HiCMatrix.py index c1583f2..181c9f0 100644 --- a/hicmatrix/test/test_HiCMatrix.py +++ b/hicmatrix/test/test_HiCMatrix.py @@ -5,7 +5,7 @@ from os import unlink import warnings from six import iteritems -from past.builtins import zip +# from past.builtins import zip from collections import OrderedDict from intervaltree import IntervalTree, Interval @@ -35,8 +35,8 @@ def test_load_h5_save_and_load_cool(): hic_cool = hm.hiCMatrix(outfile.name) nt.assert_equal(hic_cool.matrix.data, hic.matrix.data) - chrom_cool, start_cool, end_cool, _ = zip(*hic_cool.cut_intervals) - chrom, start, end, _ = zip(*hic_cool.cut_intervals) + chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals)) + chrom, start, end, _ = list(zip(*hic_cool.cut_intervals)) nt.assert_equal(chrom_cool, chrom) nt.assert_equal(start_cool, start) @@ -53,8 +53,8 @@ def test_load_h5_save_and_load_cool_2(): hic_cool = hm.hiCMatrix(outfile.name) nt.assert_equal(hic_cool.matrix.data, hic.matrix.data) - chrom_cool, start_cool, end_cool, _ = zip(*hic_cool.cut_intervals) - chrom, start, end, _ = zip(*hic_cool.cut_intervals) + chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals)) + chrom, start, end, _ = list(zip(*hic_cool.cut_intervals)) nt.assert_equal(chrom_cool, chrom) nt.assert_equal(start_cool, start) @@ -71,8 +71,8 @@ def test_load_cool_save_and_load_h5(): hic_cool = hm.hiCMatrix(outfile.name) nt.assert_equal(hic_cool.matrix.data, hic.matrix.data) - chrom_cool, start_cool, end_cool, _ = zip(*hic_cool.cut_intervals) - chrom, start, end, _ = zip(*hic_cool.cut_intervals) + chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals)) + chrom, start, end, _ = list(zip(*hic_cool.cut_intervals)) nt.assert_equal(chrom_cool, chrom) nt.assert_equal(start_cool, start) @@ -201,8 +201,8 @@ def test_convert_to_zscore_matrix_2(): m_size = mat.shape[0] # compute matrix values per distance - chrom, start, end, extra = zip( - *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals)) + chrom, start, end, extra = list(zip( + *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals))) dist_values = {} sys.stderr.write("Computing values per distance for each matrix entry\n") @@ -1146,8 +1146,7 @@ def test_create_from_cool(): hic_ma = hm.hiCMatrix(ROOT + 'one_interaction_4chr.cool') nt.assert_equal(sorted(hic_ma.matrix.indices), [0, 3]) nt.assert_equal(sorted(hic_ma.matrix.data), [1, 1]) - # This is failing: - # nt.assert_equal(sorted(hic_ma.nan_bins)[:5], [1, 2, 4, 5, 6]) + nt.assert_equal(sorted(hic_ma.nan_bins)[:5], [1, 2, 4, 5, 6]) hic_ma = hm.hiCMatrix(ROOT + 'one_interaction_diag_4chr.cool') nt.assert_equal(sorted(hic_ma.matrix.indices), [0]) nt.assert_equal(sorted(hic_ma.matrix.data), [1]) diff --git a/hicmatrix/test/test_data/test_matrix.homer.gz b/hicmatrix/test/test_data/test_matrix.homer.gz new file mode 100644 index 0000000..351a6bd Binary files /dev/null and b/hicmatrix/test/test_data/test_matrix.homer.gz differ diff --git a/hicmatrix/test/test_matrixFileHandler.py b/hicmatrix/test/test_matrixFileHandler.py index 7f184c1..695d1de 100644 --- a/hicmatrix/test/test_matrixFileHandler.py +++ b/hicmatrix/test/test_matrixFileHandler.py @@ -11,7 +11,7 @@ outfile = '/tmp/matrix' -def test_load_homer(capsys): +def test_load_homer(): # create matrixFileHandler instance with filetype 'homer' pMatrixFile = ROOT + 'test_matrix.homer' fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile) @@ -35,6 +35,30 @@ def test_load_homer(capsys): assert correction_factors is None +def test_load_homer_gzip(): + # create matrixFileHandler instance with filetype 'homer' + pMatrixFile = ROOT + 'test_matrix.homer.gz' + fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile) + assert fh is not None + + # load data + matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load() + + # create test matrix + + test_matrix = np.array([[1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315, + 0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168]]) + + nt.assert_almost_equal(matrix[0].todense(), test_matrix) + + test_cut_intervals = [('3R', 1000000, 1020000, 1), ('3R', 1020000, 1040000, 1), ('3R', 1040000, 1060000, 1), ('3R', 1060000, 1080000, 1), ('3R', 1080000, 1100000, 1), ('3R', 1100000, 1120000, 1), ('3R', 1120000, 1140000, 1), ('3R', 1140000, 1160000, 1), ('3R', 1160000, 1180000, 1), ('3R', 1180000, 1200000, 1), ('3R', 1200000, 1220000, 1), ('3R', 1220000, 1240000, 1), ('3R', 1240000, 1260000, 1)] # noqa E501 + nt.assert_equal(cut_intervals, test_cut_intervals) + + assert nan_bins is None + assert distance_counts is None + assert correction_factors is None + + def test_save_homer(): homer_outfile = outfile + '.homer' @@ -207,18 +231,18 @@ def test_load_cool2(capsys): nt.assert_almost_equal(matrix[0].todense(), test_matrix) test_cut_intervals = sum([[('chr1', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3909)], - [('chr1', 195450000, 195471971, 1.0)], - [('chrX', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3420)], - [('chrX', 171000000, 171031299, 1.0)], - [('chrY', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(1834)], - [('chrY', 91700000, 91744698, 1.0)], - [('chrM', 0, 16299, 1.0)]], []) + [('chr1', 195450000, 195471971, 1.0)], + [('chrX', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3420)], + [('chrX', 171000000, 171031299, 1.0)], + [('chrY', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(1834)], + [('chrY', 91700000, 91744698, 1.0)], + [('chrM', 0, 16299, 1.0)]], []) for index, tup in enumerate(cut_intervals): for ind, element in enumerate(tup): assert element == test_cut_intervals[index][ind] - test_nan_bins = [0, 1, 2, 4] + test_nan_bins = [1, 2, 4, 5] nt.assert_almost_equal(nan_bins[:4], test_nan_bins) assert distance_counts is None @@ -253,6 +277,52 @@ def test_save_cool(): os.unlink(cool_outfile) +def test_load_distance_cool(): + cool_outfile = outfile + '.cool' + + # create matrixFileHandler instance with filetype 'cool' + pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool' + fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pChrnameList=['1'], pDistance=2500000) + assert fh is not None + + # load data + matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load() + # set matrix variables + fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts) + # and save it. + fh.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True) + + fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile) + assert fh_test is not None + matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load() + + # check distance load works as expected + instances, features = matrix.nonzero() + distances = np.absolute(instances - features) + # log.debug('max: {}'.format(np.max(distances))) + mask = distances > 1 # 2.5 mb res --> all with 2.5 Mb distance + assert np.sum(mask) == 0 + + fh = MatrixFileHandler(pFileType='cool', pChrnameList=['1'], pMatrixFile=pMatrixFile) + assert fh is not None + + # load data + matrix2, _, _, _, _ = fh.load() + instances, features = matrix2.nonzero() + distances = np.absolute(instances - features) + mask = distances > 1 # 2.5 mb res --> all with 2.5 Mb distance + assert np.sum(mask) > 0 + + # check if load and save matrix are equal + nt.assert_equal(matrix.data, matrix_test.data) + nt.assert_equal(cut_intervals, cut_intervals_test) + nt.assert_equal(nan_bins, nan_bins_test) + nt.assert_equal(distance_counts, distance_counts_test) + nt.assert_equal(correction_factors, correction_factors_test) + + os.unlink(cool_outfile) + + def test_load_h5_save_cool(): cool_outfile = outfile + '.cool' @@ -322,9 +392,9 @@ def test_save_cool_enforce_integer(): assert fh_test is not None matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load() - pMatrixFile = ROOT + 'Li_et_al_2015.h5' - fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile) - assert fh is not None + # pMatrixFile = ROOT + 'Li_et_al_2015.h5' + # fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile) + # assert fh is not None # load data # matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load() @@ -334,6 +404,10 @@ def test_save_cool_enforce_integer(): # instances_factors *= features_factors # matrix_applied_correction = matrix.data / instances_factors + # mask = matrix.data == 0 + matrix.data = np.rint(matrix.data) + matrix.eliminate_zeros() + # matrix_test.eliminate_zeros() nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0) nt.assert_equal(len(cut_intervals), len(cut_intervals_test)) @@ -346,7 +420,7 @@ def test_save_cool_enforce_integer(): def test_load_cool_hic2cool_versions(): pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool' - hic2cool_042 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR') + hic2cool_042 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR', pCorrectionOperator='*') pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool' hic2cool_051 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR') @@ -383,7 +457,7 @@ def test_save_cool_apply_division(): fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True) - fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile, pCorrectionOperator='/') + fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile) assert fh_test is not None matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load() pMatrixFile = ROOT + 'Li_et_al_2015.cool' diff --git a/hicmatrix/utilities.py b/hicmatrix/utilities.py index b4b3e44..a5d1ba3 100644 --- a/hicmatrix/utilities.py +++ b/hicmatrix/utilities.py @@ -1,5 +1,6 @@ import numpy as np import sys +import gzip def toString(s): @@ -53,7 +54,7 @@ def check_chrom_str_bytes(pIteratableObj, pObj): def convertNansToZeros(ma): nan_elements = np.flatnonzero(np.isnan(ma.data)) if len(nan_elements) > 0: - ma.data[nan_elements] = 0 + ma.data[nan_elements] = 0.0 return ma @@ -98,3 +99,18 @@ def enlarge_bins(bin_intervals): bin_intervals[-1] = (chrom, start, end, extra) return bin_intervals + + +def opener(filename): + """ + Determines if a file is compressed or not + """ + f = open(filename, 'rb') + # print("gzip or not?", f.read(2)) + + if f.read(2) == b'\x1f\x8b': + f.seek(0) + return gzip.GzipFile(fileobj=f) + else: + f.seek(0) + return f diff --git a/requirements.txt b/requirements.txt index 414abff..53614ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,5 @@ numpy >= 1.16.* scipy >= 1.2.* pandas = 0.25.* pytables >= 3.5.* -future = 0.17.* -cooler = 0.8.5 +cooler >= 0.8.* intervaltree = 3.0.* diff --git a/setup.py b/setup.py index c2bdf6a..71347cd 100644 --- a/setup.py +++ b/setup.py @@ -96,16 +96,16 @@ def checkProgramIsInstalled(self, program, args, where_to_download, install_requires_py = ["numpy >= 1.16.*", "scipy >= 1.2.*", "tables >= 3.5.*", - "pandas >= 0.24.*", - "cooler == 0.8.5", + "pandas == 0.25.*", + "cooler >= 0.8.*", "intervaltree == 3.0.*" ] setup( name='HiCMatrix', version=get_version(), - author='Fidel Ramirez, Vivek Bhardwaj, Björn Grüning, Joachim Wolff', - author_email='deeptools@googlegroups.com', + author='Joachim Wolff, Leily Rabbani, Vivek Bhardwaj, Fidel Ramirez', + author_email='wolffj@informatik.uni-freiburg.de', packages=find_packages(), include_package_data=True,