Skip to content

Commit

Permalink
Merge pull request #14 from deeptools/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
joachimwolff authored Jun 27, 2019
2 parents 5097284 + 84d4863 commit 59f3438
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 24 deletions.
22 changes: 21 additions & 1 deletion hicmatrix/HiCMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import tables
from intervaltree import IntervalTree, Interval
import cooler
import time

from .utilities import toBytes
from .utilities import toString
Expand Down Expand Up @@ -57,29 +58,48 @@ def __init__(self, pMatrixFile=None, pChrnameList=None):
self.orig_bin_ids = []
self.orig_cut_intervals = [] # similar to orig_bin_ids. Used to identify the position of masked nan bins
self.matrixFileHandler = None

start_time = time.time()
if pMatrixFile is not None:
log.debug('Load self.matrixFileHandler')
fileType = 'cool'
if pMatrixFile.endswith('.h5'):
fileType = 'h5'
self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList)
log.debug('init time: {}'.format(time.time() - start_time))
self.matrix, self.cut_intervals, self.nan_bins, \
self.correction_factors, self.distance_counts = self.matrixFileHandler.load()
log.debug('load time: {}'.format(time.time() - start_time))
start_time = time.time()

log.debug('data loaded from file handler')
if self.nan_bins is None:
self.nan_bins = np.array([])

self.fillLowerTriangle()
log.debug('triangle time: {}'.format(time.time() - start_time))
start_time = time.time()

log.debug('fillLowerTriangle')

self.restoreMaskedBins()
log.debug('restoreMaskedBins: {}'.format(time.time() - start_time))
start_time = time.time()

log.debug('restoreMaskedBins')

self.interval_trees, self.chrBinBoundaries = \
self.intervalListToIntervalTree(self.cut_intervals)
log.debug('intervalListToIntervalTree: {}'.format(time.time() - start_time))
start_time = time.time()

log.debug('intervalListToIntervalTree')

elif pMatrixFile is None:
log.debug('Only init object, no matrix given.')
else:
log.error('matrix file not given')
sys.exit(1)
log.debug('data loaded!')

def save(self, pMatrixName, pSymmetric=True, pApplyCorrection=False, pHiCInfo=None):
""" As an output format cooler and mcooler are supported.
Expand Down
2 changes: 1 addition & 1 deletion hicmatrix/_version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version__ = '9'
__version__ = '10'
# Version number differs from HiCExplorer!
69 changes: 61 additions & 8 deletions hicmatrix/lib/cool.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
from past.builtins import zip
from builtins import super
from .matrixFile import MatrixFile
import math
import time

from hicmatrix.utilities import toString
from hicmatrix.utilities import toString, toBytes
from hicmatrix.utilities import convertNansToOnes
from hicmatrix._version import __version__

Expand All @@ -36,12 +38,15 @@ def __init__(self, pMatrixFile=None):

self.hic2cool_version = None
self.hicmatrix_version = None
self.scaleToOriginalRange = None

def getInformationCoolerBinNames(self):
    """Return the column names of the bin table of the cooler file.

    Opens the cooler file referenced by ``self.matrixFileName`` and
    returns the ``columns.values`` of its ``bins()`` table.
    """
    cooler_handle = cooler.Cooler(self.matrixFileName)
    bin_table = cooler_handle.bins()
    return bin_table.columns.values

def load(self):
log.debug('Load in cool format')
self.minValue = None
self.maxValue = None
if self.matrixFileName is None:
log.info('No matrix is initialized')

Expand All @@ -58,7 +63,7 @@ def load(self):
log.info('The following file was tried to open: {}'.format(self.matrixFileName))
log.info("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0])))
exit()

log.debug('self.chrnameList {}'.format(self.chrnameList))
if self.chrnameList is None:
matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True)
used_dtype = np.int32
Expand All @@ -84,6 +89,9 @@ def load(self):
features[start_pos:start_pos + len(_features)] = _features
start_pos += len(_features)
i += size
del _data
del _instances
del _features

# log.debug('max feature {}'.format(np.max(features)))
# log.debug('max instance {}'.format(np.max(instances)))
Expand All @@ -92,13 +100,19 @@ def load(self):
# log.debug('cooler_file.info[\'nbins\'] {}'.format(type(cooler_file.info['nbins'])))

matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype)
# del data
# del instances
# del features
self.minValue = data.min()
self.maxValue = data.max()

del data
del instances
del features
else:
if len(self.chrnameList) == 1:
try:
# self.chrnameList[0]
matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr()
self.minValue = matrix.data.min()
self.maxValue = matrix.data.max()
except ValueError:
exit("Wrong chromosome format. Please check UCSC / ensembl notation.")
else:
Expand Down Expand Up @@ -175,11 +189,35 @@ def load(self):
elif self.correctionOperator == '/':
matrix.data /= instances_factors

cut_intervals = []
# if self.scaleToOriginalRange is not None:
min_value = matrix.data.min()
max_value = matrix.data.max()
# check if the max is smaller than one or differs from the stored max by more than one order of magnitude
if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1):
desired_range_difference = self.maxValue - self.minValue

min_value = matrix.data.min()
max_value = matrix.data.max()

matrix.data = (matrix.data - min_value)
matrix.data /= (max_value - min_value)
matrix.data *= desired_range_difference
matrix.data += self.minValue
self.scaleToOriginalRange = True
# diff_scale_factor = matrix.data.max() / max_value
# if self.correctionOperator == '*':
# correction_factors *= diff_scale_factor
# if self.correctionOperator == '/':
# correction_factors /= diff_scale_factor

cut_intervals = []
time_start = time.time()
log.debug('Creating cut_intervals {}'.format(time_start))
for values in cut_intervals_data_frame.values:
cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0]))

log.debug('Creating cut_intervals {} DONE'.format(time.time() - time_start))
del cut_intervals_data_frame
del correction_factors_data_frame
# try to restore nan_bins.
try:
shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1]
Expand Down Expand Up @@ -266,6 +304,16 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
log.debug('self.correctionOperator: {}'.format(self.correctionOperator))
log.debug('self.fileWasH5: {}'.format(self.fileWasH5))

if self.scaleToOriginalRange:
min_value = self.matrix.data.min()
max_value = self.matrix.data.max()
desired_range_difference = max_value - min_value

self.matrix.data = (self.matrix.data - self.minValue)
self.matrix.data /= (self.maxValue - self.minValue)
self.matrix.data *= desired_range_difference
self.matrix.data += min_value

if self.correctionOperator == '*' or self.correctionOperator is None:
self.matrix.data /= instances_factors
elif self.correctionOperator == '/' or self.fileWasH5:
Expand All @@ -276,6 +324,11 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):

self.matrix.eliminate_zeros()

if self.correction_factors is not None and pApplyCorrection is False:
dtype_pixel['weight'] = np.float32
weight = convertNansToOnes(np.array(self.correction_factors).flatten())
bins_data_frame = bins_data_frame.assign(weight=weight)

instances, features = self.matrix.nonzero()

matrix_data_frame = pd.DataFrame(instances, columns=['bin1_id'], dtype=np.int32)
Expand Down Expand Up @@ -348,7 +401,7 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
metadata=self.hic_metadata,
temp_dir=local_temp_dir)

log.debug('info {}'.format(info))
# log.debug('info {}'.format(info))
if self.appendData == 'w':
fileName = pFileName.split('::')[0]
with h5py.File(fileName, 'r+') as h5file:
Expand Down
14 changes: 7 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
numpy >= 1.13.*
scipy >= 1.1.*
pandas >= 0.23.*
pytables >= 3.4.*
future = 0.16.*
cooler = 0.8.3
intervaltree = 2.1.*
numpy >= 1.16.*
scipy >= 1.2.*
pandas >= 0.24.*
pytables >= 3.5.*
future = 0.17.*
cooler = 0.8.5
intervaltree = 3.0.*
14 changes: 7 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,13 @@ def checkProgramIsInstalled(self, program, args, where_to_download,
sys.stderr.write("Error: {}".format(e))


install_requires_py = ["numpy >= 1.13.*",
"scipy >= 1.1.*",
"tables >= 3.4.*",
"pandas >= 0.23.*",
"future >= 0.16.*",
"cooler == 0.8.3",
"intervaltree == 2.1.*"
install_requires_py = ["numpy >= 1.16.*",
"scipy >= 1.2.*",
"tables >= 3.5.*",
"pandas >= 0.24.*",
"future >= 0.17.*",
"cooler == 0.8.5",
"intervaltree == 3.0.*"
]

setup(
Expand Down

0 comments on commit 59f3438

Please sign in to comment.