From 1e303db392edf5a7f313a9681ff0b6af078314c2 Mon Sep 17 00:00:00 2001 From: mbaak Date: Thu, 2 Jun 2022 17:14:21 +0200 Subject: [PATCH] Multiple performance updates, to Bin, SparselyBin and Categorize histograms. * SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays * Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels. * Count: new, fast filling option when float weight is known. * util.py: faster get_datatype() and get_ndim() functions. --- CHANGES.rst | 8 ++ README.rst | 2 +- histogrammar/primitives/bin.py | 12 ++- histogrammar/primitives/categorize.py | 60 +++++++++---- histogrammar/primitives/count.py | 6 ++ histogrammar/primitives/sparselybin.py | 119 +++++++++++++++++-------- histogrammar/util.py | 11 ++- histogrammar/version.py | 6 +- setup.py | 2 +- 9 files changed, 159 insertions(+), 67 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index a8dff3e..838575a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,6 +2,14 @@ Release notes ============= +Version 1.0.28, June 2022 +------------------------- +* Multiple performance updates, to Bin, SparselyBin and Categorize histograms. +* SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays +* Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels. +* Count: new, fast filling option when float weight is known. +* util.py: faster get_datatype() and get_ndim() functions. + Version 1.0.27, May 2022 ------------------------ * Multiple performance updates, thanks to Simon Brugman. diff --git a/README.rst b/README.rst index aa28245..2e2aaab 100644 --- a/README.rst +++ b/README.rst @@ -20,7 +20,7 @@ PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling This Python implementation of histogrammar been tested to guarantee compatibility with its Scala implementation. -Latest Python release: v1.0.27 (May 2022). +Latest Python release: v1.0.28 (June 2022). Announcements ============= diff --git a/histogrammar/primitives/bin.py b/histogrammar/primitives/bin.py index 6d137f8..dd3a285 100644 --- a/histogrammar/primitives/bin.py +++ b/histogrammar/primitives/bin.py @@ -148,7 +148,7 @@ def __init__(self, num, low, high, quantity=identity, value=Count(), self.values = [None] * num self.contentType = "Count" else: - self.values = [value.zero() for i in xrange(num)] + self.values = [value.zero() for i in range(num)] self.contentType = value.name self.underflow = underflow.copy() self.overflow = overflow.copy() @@ -955,6 +955,11 @@ def __hash__(self): return hash((self.low, self.high, self.quantity, self.entries, tuple( self.values), self.underflow, self.overflow, self.nanflow)) + @property + def size(self): + """Get number of bins, consistent with SparselyBin and Categorize """ + return self.num + @property def n_bins(self): """Get number of bins, consistent with SparselyBin and Categorize """ @@ -1107,7 +1112,8 @@ def bin_centers(self, low=None, high=None): import numpy as np # trivial case if low is None and high is None: - return np.array([sum(self.range(i)) / 2.0 for i in self.indexes]) + bw = self.bin_width() + return np.arange(self.low + bw / 2., self.high + bw / 2., bw) # catch weird cases elif low is not None and high is not None: if low > high: @@ -1131,7 +1137,7 @@ def bin_centers(self, low=None, high=None): if np.isclose(high, self.low + self.bin_width() * maxBin): maxBin -= 1 - return np.array([sum(self.range(i)) / 2.0 for i in range(minBin, maxBin + 1)]) + return self.low + (np.linspace(minBin, maxBin, maxBin - minBin + 1) + 0.5) * self.bin_width() def _center_from_key(self, idx): xc = (idx + 0.5) * self.bin_width() + self.low diff --git a/histogrammar/primitives/categorize.py b/histogrammar/primitives/categorize.py index 02d816e..7810c92 100644 --- a/histogrammar/primitives/categorize.py +++ b/histogrammar/primitives/categorize.py @@ -283,6 +283,13 @@ def _numpy(self, data, weights, shape): if isinstance(q, (list, tuple)): q = np.array(q) self._checkNPQuantity(q, shape) + + if isinstance(weights, (float, int)) and weights == 1: + all_weights_one = True + elif isinstance(weights, np.ndarray) and np.all(weights == 1): + all_weights_one = True + else: + all_weights_one = False self._checkNPWeights(weights, shape) weights = self._makeNPWeights(weights, shape) newentries = weights.sum() @@ -290,22 +297,37 @@ def _numpy(self, data, weights, shape): subweights = weights.copy() subweights[weights < 0.0] = 0.0 - selection = np.empty(q.shape, dtype=np.bool) - uniques, inverse = np.unique(q, return_inverse=True) - - # no possibility of exception from here on out (for rollback) - for i, x in enumerate(uniques): - if isinstance(x, (basestring, bool)): - pass - elif x is None or np.isnan(x): - x = 'NaN' - if x not in self.bins: - self.bins[x] = self.value.zero() + if self.n_dim == 1 and all_weights_one and isinstance(self.value, Count): + # special case of filling single array where all weights are 1 + uniques, counts = np.unique(q, return_counts=True) + + for c, x in zip(counts, uniques): + if isinstance(x, (basestring, bool)): + pass + elif x is None or np.isnan(x): + x = 'NaN' + if x not in self.bins: + self.bins[x] = self.value.zero() + self.bins[x]._numpy(None, c, [None]) + else: + # all other cases ... + selection = np.empty(q.shape, dtype=np.bool) + uniques, inverse = np.unique(q, return_inverse=True) - np.not_equal(inverse, i, selection) - subweights[:] = weights - subweights[selection] = 0.0 - self.bins[x]._numpy(data, subweights, shape) + # no possibility of exception from here on out (for rollback) + for i, x in enumerate(uniques): + if isinstance(x, (basestring, bool)): + pass + elif x is None or np.isnan(x): + x = 'NaN' + if x not in self.bins: + self.bins[x] = self.value.zero() + + # passing on the full array seems faster for one- AND multi-dim histograms + np.not_equal(inverse, i, selection) + subweights[:] = weights + subweights[selection] = 0.0 + self.bins[x]._numpy(data, subweights, shape) self.entries += float(newentries) @@ -430,12 +452,14 @@ def bin_labels(self, max_length=-1): """ Returns bin labels - :param int max_length: maximum length of a label. Default if full length. + :param int max_length: maximum length of a label. Default is full length. :returns: array of labels :rtype: numpy.array """ - labels = [] + if max_length == -1: + return np.array(list(self.bins.keys())) + labels = [] for i, key in enumerate(self.bins.keys()): try: label = str(key) @@ -444,7 +468,7 @@ def bin_labels(self, max_length=-1): except BaseException: label = 'bin_%d' % i labels.append(label) - return np.asarray(labels) + return np.array(labels) def bin_centers(self, max_length=-1): """ diff --git a/histogrammar/primitives/count.py b/histogrammar/primitives/count.py index 17dc442..37263d7 100644 --- a/histogrammar/primitives/count.py +++ b/histogrammar/primitives/count.py @@ -232,6 +232,12 @@ def _numpy(self, _, weights, shape): assert t.shape[0] == 1 self.entries += float(t[0]) + elif isinstance(weights, (int, float, numpy.number)): + if self.transform is identity: + self.entries += float(weights) + else: + self.entries += self.transform(weights) + else: raise ValueError("cannot use Numpy to fill an isolated Count (unless the weights are given as an array)") diff --git a/histogrammar/primitives/sparselybin.py b/histogrammar/primitives/sparselybin.py index 6e9117a..a47f45e 100644 --- a/histogrammar/primitives/sparselybin.py +++ b/histogrammar/primitives/sparselybin.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import math import numbers @@ -259,7 +260,32 @@ def at(self, index): @property def indexes(self): """Get a sequence of filled indexes.""" - return sorted(self.keys) + return sorted(self.bins.keys()) + + @property + def binsMap(self): + """Input ``bins`` as a key-value map.""" + return self.bins + + @property + def size(self): + """Number of ``bins``.""" + return len(self.bins) + + @property + def keys(self): + """Iterable over the keys of the ``bins``.""" + return self.bins.keys() + + @property + def values(self): + """Iterable over the values of the ``bins``.""" + return list(self.bins.values()) + + @property + def keySet(self): + """Set of keys among the ``bins``.""" + return set(self.bins.keys()) def range(self, index): """Get the low and high edge of a bin (given by index number).""" @@ -432,48 +458,76 @@ def _c99StructName(self): def _numpy(self, data, weights, shape): q = self.quantity(data) self._checkNPQuantity(q, shape) + + if isinstance(weights, (float, int)) and weights == 1: + all_weights_one = True + elif isinstance(weights, np.ndarray) and np.all(weights == 1): + all_weights_one = True + else: + all_weights_one = False self._checkNPWeights(weights, shape) weights = self._makeNPWeights(weights, shape) newentries = weights.sum() - import numpy - - selection = numpy.isnan(q) - numpy.bitwise_not(selection, selection) + selection = np.isnan(q) + np.bitwise_not(selection, selection) # invert selection subweights = weights.copy() subweights[selection] = 0.0 self.nanflow._numpy(data, subweights, shape) + subweights[:] = weights # switch to float here like in bin.py else numpy throws # TypeError on trivial integer cases such as: - # >>> q = numpy.array([1,2,3,4]) + # >>> q = np.array([1,2,3,4]) # >>> np.divide(q,1,q) # >>> np.floor(q,q) - q = numpy.array(q, dtype=numpy.float64) - neginfs = numpy.isneginf(q) - posinfs = numpy.isposinf(q) - - numpy.subtract(q, self.origin, q) - numpy.divide(q, self.binWidth, q) - numpy.floor(q, q) - q = numpy.array(q, dtype=numpy.int64) + q = np.array(q, dtype=np.float64) + neginfs = np.isneginf(q) + posinfs = np.isposinf(q) + + np.subtract(q, self.origin, q) + np.divide(q, self.binWidth, q) + np.floor(q, q) + q = np.array(q, dtype=np.int64) q[neginfs] = LONG_MINUSINF q[posinfs] = LONG_PLUSINF selected = q[weights > 0.0] - selection = numpy.empty(q.shape, dtype=numpy.bool) - for index in numpy.unique(selected): - if index != LONG_NAN: - bin = self.bins.get(index) - if bin is None: - bin = self.value.zero() - self.bins[index] = bin - - numpy.not_equal(q, index, selection) - subweights[:] = weights - subweights[selection] = 0.0 - bin._numpy(data, subweights, shape) + # used below. bit expensive, so do here once + n_dim = self.n_dim + + if n_dim == 1 and all_weights_one and isinstance(self.value, Count): + # special case: filling single array where all weights are 1 + # (use fast np.unique that returns counts) + uniques, counts = np.unique(selected, return_counts=True) + for c, index in zip(counts, uniques): + if index != LONG_NAN: + bin = self.bins.get(index) + if bin is None: + bin = self.value.zero() + self.bins[index] = bin + # pass counts directly to Count object + self.bins[index]._numpy(None, c, [None]) + else: + # all other cases ... + selection = np.empty(q.shape, dtype=np.bool) + for index in np.unique(selected): + if index != LONG_NAN: + bin = self.bins.get(index) + if bin is None: + bin = self.value.zero() + self.bins[index] = bin + if n_dim == 1: + # passing on the full array is faster for one-dim histograms + np.not_equal(q, index, selection) + subweights[:] = weights + subweights[selection] = 0.0 + self.bins[index]._numpy(data, subweights, shape) + else: + # in practice passing on sliced arrays is faster for multi-dim histograms + np.equal(q, index, selection) + self.bins[index]._numpy(data[selection], subweights[selection], [np.sum(selection)]) # no possibility of exception from here on out (for rollback) self.entries += float(newentries) @@ -615,12 +669,12 @@ def __hash__(self): @property def n_bins(self): - """Get number of bins, consistent with SparselyBin and Categorize """ - return self.size + """Get number of filled bins, consistent with SparselyBin and Categorize """ + return len(self.bins) def num_bins(self, low=None, high=None): """ - Returns number of bins + Returns number of bins from low to high, including unfilled Possible to set range with low and high params @@ -629,7 +683,6 @@ def num_bins(self, low=None, high=None): :returns: number of bins in range :rtype: int """ - import numpy as np # sparse hist not filled if self.minBin is None or self.maxBin is None: return 0 @@ -672,7 +725,6 @@ def bin_edges(self, low=None, high=None): :returns: numpy array with bin edges for selected range :rtype: numpy.array """ - import numpy as np # sparse hist not filled if self.minBin is None or self.maxBin is None: return np.array([self.origin, self.origin + 1]) @@ -715,7 +767,6 @@ def bin_entries(self, low=None, high=None, xvalues=[]): :returns: numpy array with numbers of entries for selected bins :rtype: numpy.array """ - import numpy as np # sparse hist not filled if self.minBin is None or self.maxBin is None: return np.array([]) @@ -757,10 +808,8 @@ def bin_centers(self, low=None, high=None): :returns: numpy array with bin centers for selected range :rtype: numpy.array """ - import numpy as np bin_edges = self.bin_edges(low, high) - centers = [(bin_edges[i] + bin_edges[i + 1]) / 2. for i in range(len(bin_edges) - 1)] - return np.array(centers) + return (bin_edges[:-1] + bin_edges[1:]) / 2 @property def mpv(self): diff --git a/histogrammar/util.py b/histogrammar/util.py index 7d21979..caae362 100644 --- a/histogrammar/util.py +++ b/histogrammar/util.py @@ -595,7 +595,7 @@ def get_datatype(hist, itr=0): # let's make an educated guess if it's a converted timestamp datatype = [np.number] if isinstance(hist, (histogrammar.Bin, histogrammar.SparselyBin)): - values = hist.bin_centers() + values = [hist.low, hist.high] elif isinstance(hist, histogrammar.CentrallyBin): values = hist.centers elif isinstance(hist, (histogrammar.IrregularlyBin, histogrammar.Stack)): @@ -607,7 +607,7 @@ def get_datatype(hist, itr=0): # values = [] else: values = [] - if len(values) > 0 and all([_is_probable_timestamp(v) for v in values]): + if len(values) > 0 and _is_probable_timestamp(values[0]) and _is_probable_timestamp(values[-1]): datatype = [np.datetime64] # Extract sub-hist and recurse @@ -628,11 +628,9 @@ def _get_sub_hist(hist): if isinstance(hist, histogrammar.Categorize): sub_hist = hist.values[0] if hist.values else hist.value elif isinstance(hist, histogrammar.Bin): - entries = [h.entries for h in hist.values] - n_in_bins = sum(entries) - if n_in_bins > 0: + if hist.entries > 0: # pick first sub-hist found that is filled - idx = next(x[0] for x in enumerate(entries) if x[1] > 0) + idx = next(i for i, b in enumerate(hist.values) if b.entries > 0) sub_hist = hist.values[idx] else: sub_hist = hist.values[0] if hist.values else histogrammar.Count() @@ -697,6 +695,7 @@ def datatype(self): # noqa return type(None) return datatype + def get_hist_props(hist): """Get histogram datatype properties. diff --git a/histogrammar/version.py b/histogrammar/version.py index 934e271..109643d 100644 --- a/histogrammar/version.py +++ b/histogrammar/version.py @@ -3,9 +3,9 @@ import re name = "histogrammar" -__version__ = "1.0.27" -version = "1.0.27" -full_version = "1.0.27" +__version__ = "1.0.28" +version = "1.0.28" +full_version = "1.0.28" release = True version_info = tuple(re.split(r"[-\.]", __version__)) diff --git a/setup.py b/setup.py index 30c30e9..53e6120 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ MAJOR = 1 REVISION = 0 -PATCH = 27 +PATCH = 28 DEV = False # NOTE: also update version at: README.rst