From 7c291973bb83c6093acb583b55f7a1cb9f7c4019 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Wed, 1 May 2019 22:44:29 +0200 Subject: [PATCH 1/6] Adding automatic scaling if maximum value after correction is applied is smaller 1 or the magnitude of raw maximum and corrected maximum differs. --- hicmatrix/_version.py | 2 +- hicmatrix/lib/cool.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/hicmatrix/_version.py b/hicmatrix/_version.py index bcb0286..664b78e 100644 --- a/hicmatrix/_version.py +++ b/hicmatrix/_version.py @@ -1,2 +1,2 @@ -__version__ = '8' +__version__ = '10-dev' # Version number differs from HiCExplorer! diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index 41fd387..2b2d416 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -13,6 +13,7 @@ from past.builtins import zip from builtins import super from .matrixFile import MatrixFile +import math from hicmatrix.utilities import toString from hicmatrix.utilities import convertNansToOnes @@ -36,12 +37,15 @@ def __init__(self, pMatrixFile=None): self.hic2cool_version = None self.hicmatrix_version = None + self.scaleToOriginalRange = None def getInformationCoolerBinNames(self): return cooler.Cooler(self.matrixFileName).bins().columns.values def load(self): log.debug('Load in cool format') + self.minValue = None + self.maxValue = None if self.matrixFileName is None: log.info('No matrix is initialized') @@ -92,6 +96,9 @@ def load(self): # log.debug('cooler_file.info[\'nbins\'] {}'.format(type(cooler_file.info['nbins']))) matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype) + self.minValue = data.min() + self.maxValue = data.max() + # del data # del instances # del features @@ -175,6 +182,28 @@ def load(self): elif self.correctionOperator == '/': matrix.data /= instances_factors + # if self.scaleToOriginalRange is not None: + min_value = matrix.data.min() + max_value = matrix.data.max() + # check if max smaller one or if not same mangnitude + if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1): + desired_range_difference = self.maxValue - self.minValue + + min_value = matrix.data.min() + max_value = matrix.data.max() + + matrix.data = (matrix.data - min_value) + matrix.data /= (max_value - min_value) + matrix.data *= desired_range_difference + matrix.data += self.minValue + self.scaleToOriginalRange = True + # diff_scale_factor = matrix.data.max() / max_value + # if self.correctionOperator == '*': + # correction_factors *= diff_scale_factor + # if self.correctionOperator == '/': + # correction_factors /= diff_scale_factor + + cut_intervals = [] for values in cut_intervals_data_frame.values: @@ -266,6 +295,16 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): log.debug('self.correctionOperator: {}'.format(self.correctionOperator)) log.debug('self.fileWasH5: {}'.format(self.fileWasH5)) + if self.scaleToOriginalRange: + min_value = self.matrix.data.min() + max_value = self.matrix.data.max() + desired_range_difference = max_value - min_value + + self.matrix.data = (self.matrix.data - self.minValue) + self.matrix.data /= (self.maxValue - self.minValue) + self.matrix.data *= desired_range_difference + self.matrix.data += min_value + if self.correctionOperator == '*' or self.correctionOperator is None: self.matrix.data /= instances_factors elif self.correctionOperator == '/' or self.fileWasH5: From af5d4a7417d7752b433e286049c50cec91441104 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Wed, 1 May 2019 22:57:11 +0200 Subject: [PATCH 2/6] patch if only a region is loaded --- hicmatrix/lib/cool.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index 2b2d416..54cfc47 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -106,6 +106,8 @@ def load(self): if len(self.chrnameList) == 1: try: matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr() + self.minValue = matrix.data.min() + self.maxValue = matrix.data.max() except ValueError: exit("Wrong chromosome format. Please check UCSC / ensembl notation.") else: From 355e34108eb4fa3d96ba4c6f7d2546b9423bc586 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 2 May 2019 17:10:12 +0000 Subject: [PATCH 3/6] try to fix chrom loading bug --- hicmatrix/lib/cool.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index 54cfc47..204b287 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -15,7 +15,7 @@ from .matrixFile import MatrixFile import math -from hicmatrix.utilities import toString +from hicmatrix.utilities import toString, toBytes from hicmatrix.utilities import convertNansToOnes from hicmatrix._version import __version__ @@ -62,7 +62,7 @@ def load(self): log.info('The following file was tried to open: {}'.format(self.matrixFileName)) log.info("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0]))) exit() - + log.debug('self.chrnameList {}'.format(self.chrnameList)) if self.chrnameList is None: matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True) used_dtype = np.int32 @@ -105,6 +105,7 @@ def load(self): else: if len(self.chrnameList) == 1: try: + # self.chrnameList[0] matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr() self.minValue = matrix.data.min() self.maxValue = matrix.data.max() @@ -390,8 +391,8 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): temp_dir=local_temp_dir) log.debug('info {}'.format(info)) - if self.appendData == 'w': - fileName = pFileName.split('::')[0] - with h5py.File(fileName, 'r+') as h5file: - h5file.attrs.update(info) - h5file.close() + # if self.appendData == 'w': + # fileName = pFileName.split('::')[0] + # with h5py.File(fileName, 'r+') as h5file: + # h5file.attrs.update(info) + # h5file.close() From 21da9a63db2fe5e265670992e94c920c3687c726 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Fri, 10 May 2019 19:25:45 +0000 Subject: [PATCH 4/6] Optimizing memory at load time and adding possibility to save correction factors and non-corrected data --- hicmatrix/HiCMatrix.py | 22 +++++++++++++++++++++- hicmatrix/lib/cool.py | 22 +++++++++++++++++----- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/hicmatrix/HiCMatrix.py b/hicmatrix/HiCMatrix.py index 80fc75f..1c25367 100644 --- a/hicmatrix/HiCMatrix.py +++ b/hicmatrix/HiCMatrix.py @@ -27,6 +27,7 @@ import tables from intervaltree import IntervalTree, Interval import cooler +import time from .utilities import toBytes from .utilities import toString @@ -57,29 +58,48 @@ def __init__(self, pMatrixFile=None, pChrnameList=None): self.orig_bin_ids = [] self.orig_cut_intervals = [] # similar to orig_bin_ids. Used to identify the position of masked nan bins self.matrixFileHandler = None - + start_time = time.time() if pMatrixFile is not None: log.debug('Load self.matrixFileHandler') fileType = 'cool' if pMatrixFile.endswith('.h5'): fileType = 'h5' self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList) + log.debug('init time: {}'.format(time.time() - start_time)) self.matrix, self.cut_intervals, self.nan_bins, \ self.correction_factors, self.distance_counts = self.matrixFileHandler.load() + log.debug('load time: {}'.format(time.time() - start_time)) + start_time = time.time() + log.debug('data loaded from file handler') if self.nan_bins is None: self.nan_bins = np.array([]) self.fillLowerTriangle() + log.debug('triangle time: {}'.format(time.time() - start_time)) + start_time = time.time() + + log.debug('fillLowerTriangle') self.restoreMaskedBins() + log.debug('restoreMaskedBins: {}'.format(time.time() - start_time)) + start_time = time.time() + + log.debug('restoreMaskedBins') + self.interval_trees, self.chrBinBoundaries = \ self.intervalListToIntervalTree(self.cut_intervals) + log.debug('intervalListToIntervalTree: {}'.format(time.time() - start_time)) + start_time = time.time() + + log.debug('intervalListToIntervalTree') + elif pMatrixFile is None: log.debug('Only init object, no matrix given.') else: log.error('matrix file not given') sys.exit(1) + log.debug('data loaded!') def save(self, pMatrixName, pSymmetric=True, pApplyCorrection=False, pHiCInfo=None): """ As an output format cooler and mcooler are supported. diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index 204b287..e50979a 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -14,6 +14,7 @@ from builtins import super from .matrixFile import MatrixFile import math +import time from hicmatrix.utilities import toString, toBytes from hicmatrix.utilities import convertNansToOnes @@ -88,6 +89,9 @@ def load(self): features[start_pos:start_pos + len(_features)] = _features start_pos += len(_features) i += size + del _data + del _instances + del _features # log.debug('max feature {}'.format(np.max(features))) # log.debug('max instance {}'.format(np.max(instances))) @@ -99,9 +103,9 @@ def load(self): self.minValue = data.min() self.maxValue = data.max() - # del data - # del instances - # del features + del data + del instances + del features else: if len(self.chrnameList) == 1: try: @@ -208,10 +212,13 @@ def load(self): cut_intervals = [] - + time_start = time.time() + log.debug('Creating cut_intervals {}'.format(time_start)) for values in cut_intervals_data_frame.values: cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0])) - + log.debug('Creating cut_intervals {} DONE'.format(time.time() - time_start)) + del cut_intervals_data_frame + del correction_factors_data_frame # try to restore nan_bins. try: shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1] @@ -318,6 +325,11 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): self.matrix.eliminate_zeros() + if self.correction_factors is not None and pApplyCorrection is False: + dtype_pixel['weight'] = np.float32 + weight = convertNansToOnes(np.array(self.correction_factors).flatten()) + bins_data_frame = bins_data_frame.assign(weight=weight) + instances, features = self.matrix.nonzero() matrix_data_frame = pd.DataFrame(instances, columns=['bin1_id'], dtype=np.int32) From 814fc49d76a1c12acb2087a65cde5a7d00573efd Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 27 Jun 2019 09:18:10 +0000 Subject: [PATCH 5/6] Linting, version number update to 10 --- hicmatrix/HiCMatrix.py | 2 +- hicmatrix/_version.py | 2 +- hicmatrix/lib/cool.py | 19 +++++++++---------- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/hicmatrix/HiCMatrix.py b/hicmatrix/HiCMatrix.py index 1c25367..7ba9b99 100644 --- a/hicmatrix/HiCMatrix.py +++ b/hicmatrix/HiCMatrix.py @@ -27,7 +27,7 @@ import tables from intervaltree import IntervalTree, Interval import cooler -import time +import time from .utilities import toBytes from .utilities import toString diff --git a/hicmatrix/_version.py b/hicmatrix/_version.py index 664b78e..fe7e4d3 100644 --- a/hicmatrix/_version.py +++ b/hicmatrix/_version.py @@ -1,2 +1,2 @@ -__version__ = '10-dev' +__version__ = '10' # Version number differs from HiCExplorer! diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index e50979a..b6add5f 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -193,9 +193,9 @@ def load(self): min_value = matrix.data.min() max_value = matrix.data.max() # check if max smaller one or if not same mangnitude - if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1): + if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1): desired_range_difference = self.maxValue - self.minValue - + min_value = matrix.data.min() max_value = matrix.data.max() @@ -204,13 +204,12 @@ def load(self): matrix.data *= desired_range_difference matrix.data += self.minValue self.scaleToOriginalRange = True - # diff_scale_factor = matrix.data.max() / max_value + # diff_scale_factor = matrix.data.max() / max_value # if self.correctionOperator == '*': # correction_factors *= diff_scale_factor # if self.correctionOperator == '/': # correction_factors /= diff_scale_factor - cut_intervals = [] time_start = time.time() log.debug('Creating cut_intervals {}'.format(time_start)) @@ -402,9 +401,9 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): metadata=self.hic_metadata, temp_dir=local_temp_dir) - log.debug('info {}'.format(info)) - # if self.appendData == 'w': - # fileName = pFileName.split('::')[0] - # with h5py.File(fileName, 'r+') as h5file: - # h5file.attrs.update(info) - # h5file.close() + # log.debug('info {}'.format(info)) + if self.appendData == 'w': + fileName = pFileName.split('::')[0] + with h5py.File(fileName, 'r+') as h5file: + h5file.attrs.update(info) + h5file.close() From 84d4863b281f1bdf709e1f633ecfcff3013fdd8a Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 27 Jun 2019 09:27:37 +0000 Subject: [PATCH 6/6] Updating dependencies --- requirements.txt | 14 +++++++------- setup.py | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/requirements.txt b/requirements.txt index d6028e2..52507a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -numpy >= 1.13.* -scipy >= 1.1.* -pandas >= 0.23.* -pytables >= 3.4.* -future = 0.16.* -cooler = 0.8.3 -intervaltree = 2.1.* +numpy >= 1.16.* +scipy >= 1.2.* +pandas >= 0.24.* +pytables >= 3.5.* +future = 0.17.* +cooler = 0.8.5 +intervaltree = 3.0.* diff --git a/setup.py b/setup.py index 08b41db..aa1cea7 100644 --- a/setup.py +++ b/setup.py @@ -93,13 +93,13 @@ def checkProgramIsInstalled(self, program, args, where_to_download, sys.stderr.write("Error: {}".format(e)) -install_requires_py = ["numpy >= 1.13.*", - "scipy >= 1.1.*", - "tables >= 3.4.*", - "pandas >= 0.23.*", - "future >= 0.16.*", - "cooler == 0.8.3", - "intervaltree == 2.1.*" +install_requires_py = ["numpy >= 1.16.*", + "scipy >= 1.2.*", + "tables >= 3.5.*", + "pandas >= 0.24.*", + "future >= 0.17.*", + "cooler == 0.8.5", + "intervaltree == 3.0.*" ] setup(