From 1e303db392edf5a7f313a9681ff0b6af078314c2 Mon Sep 17 00:00:00 2001
From: mbaak <maxbaak@gmail.com>
Date: Thu, 2 Jun 2022 17:14:21 +0200
Subject: [PATCH] Multiple performance updates to Bin, SparselyBin and
 Categorize histograms.

* SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays.
* Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels.
* Count: new, fast filling option when float weight is known.
* util.py: faster get_datatype() and get_ndim() functions.
---
 CHANGES.rst                            |   8 ++
 README.rst                             |   2 +-
 histogrammar/primitives/bin.py         |  12 ++-
 histogrammar/primitives/categorize.py  |  60 +++++++++----
 histogrammar/primitives/count.py       |   6 ++
 histogrammar/primitives/sparselybin.py | 119 +++++++++++++++++--------
 histogrammar/util.py                   |  11 ++-
 histogrammar/version.py                |   6 +-
 setup.py                               |   2 +-
 9 files changed, 159 insertions(+), 67 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index a8dff3e..838575a 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -2,6 +2,14 @@
 Release notes
 =============
 
+Version 1.0.28, June 2022
+-------------------------
+* Multiple performance updates to Bin, SparselyBin and Categorize histograms.
+* SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays.
+* Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels.
+* Count: new, fast filling option when float weight is known.
+* util.py: faster get_datatype() and get_ndim() functions.
+
 Version 1.0.27, May 2022
 ------------------------
 * Multiple performance updates, thanks to Simon Brugman.
diff --git a/README.rst b/README.rst
index aa28245..2e2aaab 100644
--- a/README.rst
+++ b/README.rst
@@ -20,7 +20,7 @@ PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling
 
 This Python implementation of histogrammar been tested to guarantee compatibility with its Scala implementation.
 
-Latest Python release: v1.0.27 (May 2022).
+Latest Python release: v1.0.28 (June 2022).
 
 Announcements
 =============
diff --git a/histogrammar/primitives/bin.py b/histogrammar/primitives/bin.py
index 6d137f8..dd3a285 100644
--- a/histogrammar/primitives/bin.py
+++ b/histogrammar/primitives/bin.py
@@ -148,7 +148,7 @@ def __init__(self, num, low, high, quantity=identity, value=Count(),
             self.values = [None] * num
             self.contentType = "Count"
         else:
-            self.values = [value.zero() for i in xrange(num)]
+            self.values = [value.zero() for i in range(num)]
             self.contentType = value.name
         self.underflow = underflow.copy()
         self.overflow = overflow.copy()
@@ -955,6 +955,11 @@ def __hash__(self):
         return hash((self.low, self.high, self.quantity, self.entries, tuple(
             self.values), self.underflow, self.overflow, self.nanflow))
 
+    @property
+    def size(self):
+        """Get number of bins, consistent with SparselyBin and Categorize """
+        return self.num
+
     @property
     def n_bins(self):
         """Get number of bins, consistent with SparselyBin and Categorize """
@@ -1107,7 +1112,8 @@ def bin_centers(self, low=None, high=None):
         import numpy as np
         # trivial case
         if low is None and high is None:
-            return np.array([sum(self.range(i)) / 2.0 for i in self.indexes])
+            bw = self.bin_width()
+            return np.arange(self.low + bw / 2., self.high + bw / 2., bw)
         # catch weird cases
         elif low is not None and high is not None:
             if low > high:
@@ -1131,7 +1137,7 @@ def bin_centers(self, low=None, high=None):
             if np.isclose(high, self.low + self.bin_width() * maxBin):
                 maxBin -= 1
 
-        return np.array([sum(self.range(i)) / 2.0 for i in range(minBin, maxBin + 1)])
+        return self.low + (np.linspace(minBin, maxBin, maxBin - minBin + 1) + 0.5) * self.bin_width()
 
     def _center_from_key(self, idx):
         xc = (idx + 0.5) * self.bin_width() + self.low
diff --git a/histogrammar/primitives/categorize.py b/histogrammar/primitives/categorize.py
index 02d816e..7810c92 100644
--- a/histogrammar/primitives/categorize.py
+++ b/histogrammar/primitives/categorize.py
@@ -283,6 +283,13 @@ def _numpy(self, data, weights, shape):
         if isinstance(q, (list, tuple)):
             q = np.array(q)
         self._checkNPQuantity(q, shape)
+
+        if isinstance(weights, (float, int)) and weights == 1:
+            all_weights_one = True
+        elif isinstance(weights, np.ndarray) and np.all(weights == 1):
+            all_weights_one = True
+        else:
+            all_weights_one = False
         self._checkNPWeights(weights, shape)
         weights = self._makeNPWeights(weights, shape)
         newentries = weights.sum()
@@ -290,22 +297,37 @@ def _numpy(self, data, weights, shape):
         subweights = weights.copy()
         subweights[weights < 0.0] = 0.0
 
-        selection = np.empty(q.shape, dtype=np.bool)
-        uniques, inverse = np.unique(q, return_inverse=True)
-
-        # no possibility of exception from here on out (for rollback)
-        for i, x in enumerate(uniques):
-            if isinstance(x, (basestring, bool)):
-                pass
-            elif x is None or np.isnan(x):
-                x = 'NaN'
-            if x not in self.bins:
-                self.bins[x] = self.value.zero()
+        if self.n_dim == 1 and all_weights_one and isinstance(self.value, Count):
+            # special case of filling single array where all weights are 1
+            uniques, counts = np.unique(q, return_counts=True)
+
+            for c, x in zip(counts, uniques):
+                if isinstance(x, (basestring, bool)):
+                    pass
+                elif x is None or np.isnan(x):
+                    x = 'NaN'
+                if x not in self.bins:
+                    self.bins[x] = self.value.zero()
+                self.bins[x]._numpy(None, c, [None])
+        else:
+            # all other cases ...
+            selection = np.empty(q.shape, dtype=np.bool)
+            uniques, inverse = np.unique(q, return_inverse=True)
 
-            np.not_equal(inverse, i, selection)
-            subweights[:] = weights
-            subweights[selection] = 0.0
-            self.bins[x]._numpy(data, subweights, shape)
+            # no possibility of exception from here on out (for rollback)
+            for i, x in enumerate(uniques):
+                if isinstance(x, (basestring, bool)):
+                    pass
+                elif x is None or np.isnan(x):
+                    x = 'NaN'
+                if x not in self.bins:
+                    self.bins[x] = self.value.zero()
+
+                # passing on the full array seems faster for one- AND multi-dim histograms
+                np.not_equal(inverse, i, selection)
+                subweights[:] = weights
+                subweights[selection] = 0.0
+                self.bins[x]._numpy(data, subweights, shape)
 
         self.entries += float(newentries)
 
@@ -430,12 +452,14 @@ def bin_labels(self, max_length=-1):
         """
         Returns bin labels
 
-        :param int max_length: maximum length of a label. Default if full length.
+        :param int max_length: maximum length of a label. Default is full length.
         :returns: array of labels
         :rtype: numpy.array
         """
-        labels = []
+        if max_length == -1:
+            return np.array(list(self.bins.keys()))
 
+        labels = []
         for i, key in enumerate(self.bins.keys()):
             try:
                 label = str(key)
@@ -444,7 +468,7 @@ def bin_labels(self, max_length=-1):
             except BaseException:
                 label = 'bin_%d' % i
             labels.append(label)
-        return np.asarray(labels)
+        return np.array(labels)
 
     def bin_centers(self, max_length=-1):
         """
diff --git a/histogrammar/primitives/count.py b/histogrammar/primitives/count.py
index 17dc442..37263d7 100644
--- a/histogrammar/primitives/count.py
+++ b/histogrammar/primitives/count.py
@@ -232,6 +232,12 @@ def _numpy(self, _, weights, shape):
                 assert t.shape[0] == 1
                 self.entries += float(t[0])
 
+        elif isinstance(weights, (int, float, numpy.number)):
+            if self.transform is identity:
+                self.entries += float(weights)
+            else:
+                self.entries += self.transform(weights)
+
         else:
             raise ValueError("cannot use Numpy to fill an isolated Count (unless the weights are given as an array)")
 
diff --git a/histogrammar/primitives/sparselybin.py b/histogrammar/primitives/sparselybin.py
index 6e9117a..a47f45e 100644
--- a/histogrammar/primitives/sparselybin.py
+++ b/histogrammar/primitives/sparselybin.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
 import math
 import numbers
 
@@ -259,7 +260,32 @@ def at(self, index):
     @property
     def indexes(self):
         """Get a sequence of filled indexes."""
-        return sorted(self.keys)
+        return sorted(self.bins.keys())
+
+    @property
+    def binsMap(self):
+        """Input ``bins`` as a key-value map."""
+        return self.bins
+
+    @property
+    def size(self):
+        """Number of ``bins``."""
+        return len(self.bins)
+
+    @property
+    def keys(self):
+        """Iterable over the keys of the ``bins``."""
+        return self.bins.keys()
+
+    @property
+    def values(self):
+        """Iterable over the values of the ``bins``."""
+        return list(self.bins.values())
+
+    @property
+    def keySet(self):
+        """Set of keys among the ``bins``."""
+        return set(self.bins.keys())
 
     def range(self, index):
         """Get the low and high edge of a bin (given by index number)."""
@@ -432,48 +458,76 @@ def _c99StructName(self):
     def _numpy(self, data, weights, shape):
         q = self.quantity(data)
         self._checkNPQuantity(q, shape)
+
+        if isinstance(weights, (float, int)) and weights == 1:
+            all_weights_one = True
+        elif isinstance(weights, np.ndarray) and np.all(weights == 1):
+            all_weights_one = True
+        else:
+            all_weights_one = False
         self._checkNPWeights(weights, shape)
         weights = self._makeNPWeights(weights, shape)
         newentries = weights.sum()
 
-        import numpy
-
-        selection = numpy.isnan(q)
-        numpy.bitwise_not(selection, selection)
+        selection = np.isnan(q)
+        np.bitwise_not(selection, selection)  # invert selection
         subweights = weights.copy()
         subweights[selection] = 0.0
         self.nanflow._numpy(data, subweights, shape)
+        subweights[:] = weights
 
         # switch to float here like in bin.py else numpy throws
         # TypeError on trivial integer cases such as:
-        # >>> q = numpy.array([1,2,3,4])
+        # >>> q = np.array([1,2,3,4])
         # >>> np.divide(q,1,q)
         # >>> np.floor(q,q)
-        q = numpy.array(q, dtype=numpy.float64)
-        neginfs = numpy.isneginf(q)
-        posinfs = numpy.isposinf(q)
-
-        numpy.subtract(q, self.origin, q)
-        numpy.divide(q, self.binWidth, q)
-        numpy.floor(q, q)
-        q = numpy.array(q, dtype=numpy.int64)
+        q = np.array(q, dtype=np.float64)
+        neginfs = np.isneginf(q)
+        posinfs = np.isposinf(q)
+
+        np.subtract(q, self.origin, q)
+        np.divide(q, self.binWidth, q)
+        np.floor(q, q)
+        q = np.array(q, dtype=np.int64)
         q[neginfs] = LONG_MINUSINF
         q[posinfs] = LONG_PLUSINF
 
         selected = q[weights > 0.0]
 
-        selection = numpy.empty(q.shape, dtype=numpy.bool)
-        for index in numpy.unique(selected):
-            if index != LONG_NAN:
-                bin = self.bins.get(index)
-                if bin is None:
-                    bin = self.value.zero()
-                    self.bins[index] = bin
-
-                numpy.not_equal(q, index, selection)
-                subweights[:] = weights
-                subweights[selection] = 0.0
-                bin._numpy(data, subweights, shape)
+        # used below. bit expensive, so do here once
+        n_dim = self.n_dim
+
+        if n_dim == 1 and all_weights_one and isinstance(self.value, Count):
+            # special case: filling single array where all weights are 1
+            # (use fast np.unique that returns counts)
+            uniques, counts = np.unique(selected, return_counts=True)
+            for c, index in zip(counts, uniques):
+                if index != LONG_NAN:
+                    bin = self.bins.get(index)
+                    if bin is None:
+                        bin = self.value.zero()
+                        self.bins[index] = bin
+                    # pass counts directly to Count object
+                    self.bins[index]._numpy(None, c, [None])
+        else:
+            # all other cases ...
+            selection = np.empty(q.shape, dtype=np.bool)
+            for index in np.unique(selected):
+                if index != LONG_NAN:
+                    bin = self.bins.get(index)
+                    if bin is None:
+                        bin = self.value.zero()
+                        self.bins[index] = bin
+                    if n_dim == 1:
+                        # passing on the full array is faster for one-dim histograms
+                        np.not_equal(q, index, selection)
+                        subweights[:] = weights
+                        subweights[selection] = 0.0
+                        self.bins[index]._numpy(data, subweights, shape)
+                    else:
+                        # in practice passing on sliced arrays is faster for multi-dim histograms
+                        np.equal(q, index, selection)
+                        self.bins[index]._numpy(data[selection], subweights[selection], [np.sum(selection)])
 
         # no possibility of exception from here on out (for rollback)
         self.entries += float(newentries)
@@ -615,12 +669,12 @@ def __hash__(self):
 
     @property
     def n_bins(self):
-        """Get number of bins, consistent with SparselyBin and Categorize """
-        return self.size
+        """Get number of filled bins, consistent with SparselyBin and Categorize """
+        return len(self.bins)
 
     def num_bins(self, low=None, high=None):
         """
-        Returns number of bins
+        Returns number of bins from low to high, including unfilled
 
         Possible to set range with low and high params
 
@@ -629,7 +683,6 @@ def num_bins(self, low=None, high=None):
         :returns: number of bins in range
         :rtype: int
         """
-        import numpy as np
         # sparse hist not filled
         if self.minBin is None or self.maxBin is None:
             return 0
@@ -672,7 +725,6 @@ def bin_edges(self, low=None, high=None):
         :returns: numpy array with bin edges for selected range
         :rtype: numpy.array
         """
-        import numpy as np
         # sparse hist not filled
         if self.minBin is None or self.maxBin is None:
             return np.array([self.origin, self.origin + 1])
@@ -715,7 +767,6 @@ def bin_entries(self, low=None, high=None, xvalues=[]):
         :returns: numpy array with numbers of entries for selected bins
         :rtype: numpy.array
         """
-        import numpy as np
         # sparse hist not filled
         if self.minBin is None or self.maxBin is None:
             return np.array([])
@@ -757,10 +808,8 @@ def bin_centers(self, low=None, high=None):
         :returns: numpy array with bin centers for selected range
         :rtype: numpy.array
         """
-        import numpy as np
         bin_edges = self.bin_edges(low, high)
-        centers = [(bin_edges[i] + bin_edges[i + 1]) / 2. for i in range(len(bin_edges) - 1)]
-        return np.array(centers)
+        return (bin_edges[:-1] + bin_edges[1:]) / 2
 
     @property
     def mpv(self):
diff --git a/histogrammar/util.py b/histogrammar/util.py
index 7d21979..caae362 100644
--- a/histogrammar/util.py
+++ b/histogrammar/util.py
@@ -595,7 +595,7 @@ def get_datatype(hist, itr=0):
         # let's make an educated guess if it's a converted timestamp
         datatype = [np.number]
         if isinstance(hist, (histogrammar.Bin, histogrammar.SparselyBin)):
-            values = hist.bin_centers()
+            values = [hist.low, hist.high]
         elif isinstance(hist, histogrammar.CentrallyBin):
             values = hist.centers
         elif isinstance(hist, (histogrammar.IrregularlyBin, histogrammar.Stack)):
@@ -607,7 +607,7 @@ def get_datatype(hist, itr=0):
         #     values = []
         else:
             values = []
-        if len(values) > 0 and all([_is_probable_timestamp(v) for v in values]):
+        if len(values) > 0 and _is_probable_timestamp(values[0]) and _is_probable_timestamp(values[-1]):
             datatype = [np.datetime64]
 
     # Extract sub-hist and recurse
@@ -628,11 +628,9 @@ def _get_sub_hist(hist):
     if isinstance(hist, histogrammar.Categorize):
         sub_hist = hist.values[0] if hist.values else hist.value
     elif isinstance(hist, histogrammar.Bin):
-        entries = [h.entries for h in hist.values]
-        n_in_bins = sum(entries)
-        if n_in_bins > 0:
+        if hist.entries > 0:
             # pick first sub-hist found that is filled
-            idx = next(x[0] for x in enumerate(entries) if x[1] > 0)
+            idx = next(i for i, b in enumerate(hist.values) if b.entries > 0)
             sub_hist = hist.values[idx]
         else:
             sub_hist = hist.values[0] if hist.values else histogrammar.Count()
@@ -697,6 +695,7 @@ def datatype(self):  # noqa
             return type(None)
     return datatype
 
+
 def get_hist_props(hist):
     """Get histogram datatype properties.
 
diff --git a/histogrammar/version.py b/histogrammar/version.py
index 934e271..109643d 100644
--- a/histogrammar/version.py
+++ b/histogrammar/version.py
@@ -3,9 +3,9 @@
 import re
 
 name = "histogrammar"
-__version__ = "1.0.27"
-version = "1.0.27"
-full_version = "1.0.27"
+__version__ = "1.0.28"
+version = "1.0.28"
+full_version = "1.0.28"
 release = True
 
 version_info = tuple(re.split(r"[-\.]", __version__))
diff --git a/setup.py b/setup.py
index 30c30e9..53e6120 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
 
 MAJOR = 1
 REVISION = 0
-PATCH = 27
+PATCH = 28
 DEV = False
 # NOTE: also update version at: README.rst