Skip to content

Commit

Permalink
Merge pull request #55 from histogrammar/performance_updates
Browse files Browse the repository at this point in the history
Multiple performance updates, to Bin, SparselyBin and Categorize hists
  • Loading branch information
mbaak authored Jun 6, 2022
2 parents 8c82095 + 1e303db commit dcbf220
Show file tree
Hide file tree
Showing 9 changed files with 159 additions and 67 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
Release notes
=============

Version 1.0.28, June 2022
-------------------------
* Multiple performance updates, to Bin, SparselyBin and Categorize histograms.
* SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays
* Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels.
* Count: new, fast filling option when float weight is known.
* util.py: faster get_datatype() and get_ndim() functions.

Version 1.0.27, May 2022
------------------------
* Multiple performance updates, thanks to Simon Brugman.
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling

This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.

Latest Python release: v1.0.27 (May 2022).
Latest Python release: v1.0.28 (June 2022).

Announcements
=============
Expand Down
12 changes: 9 additions & 3 deletions histogrammar/primitives/bin.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def __init__(self, num, low, high, quantity=identity, value=Count(),
self.values = [None] * num
self.contentType = "Count"
else:
self.values = [value.zero() for i in xrange(num)]
self.values = [value.zero() for i in range(num)]
self.contentType = value.name
self.underflow = underflow.copy()
self.overflow = overflow.copy()
Expand Down Expand Up @@ -955,6 +955,11 @@ def __hash__(self):
return hash((self.low, self.high, self.quantity, self.entries, tuple(
self.values), self.underflow, self.overflow, self.nanflow))

@property
def size(self):
"""Get number of bins, consistent with SparselyBin and Categorize """
return self.num

@property
def n_bins(self):
"""Get number of bins, consistent with SparselyBin and Categorize """
Expand Down Expand Up @@ -1107,7 +1112,8 @@ def bin_centers(self, low=None, high=None):
import numpy as np
# trivial case
if low is None and high is None:
return np.array([sum(self.range(i)) / 2.0 for i in self.indexes])
bw = self.bin_width()
return np.arange(self.low + bw / 2., self.high + bw / 2., bw)
# catch weird cases
elif low is not None and high is not None:
if low > high:
Expand All @@ -1131,7 +1137,7 @@ def bin_centers(self, low=None, high=None):
if np.isclose(high, self.low + self.bin_width() * maxBin):
maxBin -= 1

return np.array([sum(self.range(i)) / 2.0 for i in range(minBin, maxBin + 1)])
return self.low + (np.linspace(minBin, maxBin, maxBin - minBin + 1) + 0.5) * self.bin_width()

def _center_from_key(self, idx):
xc = (idx + 0.5) * self.bin_width() + self.low
Expand Down
60 changes: 42 additions & 18 deletions histogrammar/primitives/categorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,29 +283,51 @@ def _numpy(self, data, weights, shape):
if isinstance(q, (list, tuple)):
q = np.array(q)
self._checkNPQuantity(q, shape)

if isinstance(weights, (float, int)) and weights == 1:
all_weights_one = True
elif isinstance(weights, np.ndarray) and np.all(weights == 1):
all_weights_one = True
else:
all_weights_one = False
self._checkNPWeights(weights, shape)
weights = self._makeNPWeights(weights, shape)
newentries = weights.sum()

subweights = weights.copy()
subweights[weights < 0.0] = 0.0

selection = np.empty(q.shape, dtype=np.bool)
uniques, inverse = np.unique(q, return_inverse=True)

# no possibility of exception from here on out (for rollback)
for i, x in enumerate(uniques):
if isinstance(x, (basestring, bool)):
pass
elif x is None or np.isnan(x):
x = 'NaN'
if x not in self.bins:
self.bins[x] = self.value.zero()
if self.n_dim == 1 and all_weights_one and isinstance(self.value, Count):
# special case of filling single array where all weights are 1
uniques, counts = np.unique(q, return_counts=True)

for c, x in zip(counts, uniques):
if isinstance(x, (basestring, bool)):
pass
elif x is None or np.isnan(x):
x = 'NaN'
if x not in self.bins:
self.bins[x] = self.value.zero()
self.bins[x]._numpy(None, c, [None])
else:
# all other cases ...
selection = np.empty(q.shape, dtype=np.bool)
uniques, inverse = np.unique(q, return_inverse=True)

np.not_equal(inverse, i, selection)
subweights[:] = weights
subweights[selection] = 0.0
self.bins[x]._numpy(data, subweights, shape)
# no possibility of exception from here on out (for rollback)
for i, x in enumerate(uniques):
if isinstance(x, (basestring, bool)):
pass
elif x is None or np.isnan(x):
x = 'NaN'
if x not in self.bins:
self.bins[x] = self.value.zero()

# passing on the full array seems faster for one- AND multi-dim histograms
np.not_equal(inverse, i, selection)
subweights[:] = weights
subweights[selection] = 0.0
self.bins[x]._numpy(data, subweights, shape)

self.entries += float(newentries)

Expand Down Expand Up @@ -430,12 +452,14 @@ def bin_labels(self, max_length=-1):
"""
Returns bin labels
:param int max_length: maximum length of a label. Default if full length.
:param int max_length: maximum length of a label. Default is full length.
:returns: array of labels
:rtype: numpy.array
"""
labels = []
if max_length == -1:
return np.array(list(self.bins.keys()))

labels = []
for i, key in enumerate(self.bins.keys()):
try:
label = str(key)
Expand All @@ -444,7 +468,7 @@ def bin_labels(self, max_length=-1):
except BaseException:
label = 'bin_%d' % i
labels.append(label)
return np.asarray(labels)
return np.array(labels)

def bin_centers(self, max_length=-1):
"""
Expand Down
6 changes: 6 additions & 0 deletions histogrammar/primitives/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ def _numpy(self, _, weights, shape):
assert t.shape[0] == 1
self.entries += float(t[0])

elif isinstance(weights, (int, float, numpy.number)):
if self.transform is identity:
self.entries += float(weights)
else:
self.entries += self.transform(weights)

else:
raise ValueError("cannot use Numpy to fill an isolated Count (unless the weights are given as an array)")

Expand Down
119 changes: 84 additions & 35 deletions histogrammar/primitives/sparselybin.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import math
import numbers

Expand Down Expand Up @@ -259,7 +260,32 @@ def at(self, index):
@property
def indexes(self):
"""Get a sequence of filled indexes."""
return sorted(self.keys)
return sorted(self.bins.keys())

@property
def binsMap(self):
"""Input ``bins`` as a key-value map."""
return self.bins

@property
def size(self):
"""Number of ``bins``."""
return len(self.bins)

@property
def keys(self):
"""Iterable over the keys of the ``bins``."""
return self.bins.keys()

@property
def values(self):
"""Iterable over the values of the ``bins``."""
return list(self.bins.values())

@property
def keySet(self):
"""Set of keys among the ``bins``."""
return set(self.bins.keys())

def range(self, index):
"""Get the low and high edge of a bin (given by index number)."""
Expand Down Expand Up @@ -432,48 +458,76 @@ def _c99StructName(self):
def _numpy(self, data, weights, shape):
q = self.quantity(data)
self._checkNPQuantity(q, shape)

if isinstance(weights, (float, int)) and weights == 1:
all_weights_one = True
elif isinstance(weights, np.ndarray) and np.all(weights == 1):
all_weights_one = True
else:
all_weights_one = False
self._checkNPWeights(weights, shape)
weights = self._makeNPWeights(weights, shape)
newentries = weights.sum()

import numpy

selection = numpy.isnan(q)
numpy.bitwise_not(selection, selection)
selection = np.isnan(q)
np.bitwise_not(selection, selection) # invert selection
subweights = weights.copy()
subweights[selection] = 0.0
self.nanflow._numpy(data, subweights, shape)
subweights[:] = weights

# switch to float here like in bin.py else numpy throws
# TypeError on trivial integer cases such as:
# >>> q = numpy.array([1,2,3,4])
# >>> q = np.array([1,2,3,4])
# >>> np.divide(q,1,q)
# >>> np.floor(q,q)
q = numpy.array(q, dtype=numpy.float64)
neginfs = numpy.isneginf(q)
posinfs = numpy.isposinf(q)

numpy.subtract(q, self.origin, q)
numpy.divide(q, self.binWidth, q)
numpy.floor(q, q)
q = numpy.array(q, dtype=numpy.int64)
q = np.array(q, dtype=np.float64)
neginfs = np.isneginf(q)
posinfs = np.isposinf(q)

np.subtract(q, self.origin, q)
np.divide(q, self.binWidth, q)
np.floor(q, q)
q = np.array(q, dtype=np.int64)
q[neginfs] = LONG_MINUSINF
q[posinfs] = LONG_PLUSINF

selected = q[weights > 0.0]

selection = numpy.empty(q.shape, dtype=numpy.bool)
for index in numpy.unique(selected):
if index != LONG_NAN:
bin = self.bins.get(index)
if bin is None:
bin = self.value.zero()
self.bins[index] = bin

numpy.not_equal(q, index, selection)
subweights[:] = weights
subweights[selection] = 0.0
bin._numpy(data, subweights, shape)
# used below. bit expensive, so do here once
n_dim = self.n_dim

if n_dim == 1 and all_weights_one and isinstance(self.value, Count):
# special case: filling single array where all weights are 1
# (use fast np.unique that returns counts)
uniques, counts = np.unique(selected, return_counts=True)
for c, index in zip(counts, uniques):
if index != LONG_NAN:
bin = self.bins.get(index)
if bin is None:
bin = self.value.zero()
self.bins[index] = bin
# pass counts directly to Count object
self.bins[index]._numpy(None, c, [None])
else:
# all other cases ...
selection = np.empty(q.shape, dtype=np.bool)
for index in np.unique(selected):
if index != LONG_NAN:
bin = self.bins.get(index)
if bin is None:
bin = self.value.zero()
self.bins[index] = bin
if n_dim == 1:
# passing on the full array is faster for one-dim histograms
np.not_equal(q, index, selection)
subweights[:] = weights
subweights[selection] = 0.0
self.bins[index]._numpy(data, subweights, shape)
else:
# in practice passing on sliced arrays is faster for multi-dim histograms
np.equal(q, index, selection)
self.bins[index]._numpy(data[selection], subweights[selection], [np.sum(selection)])

# no possibility of exception from here on out (for rollback)
self.entries += float(newentries)
Expand Down Expand Up @@ -615,12 +669,12 @@ def __hash__(self):

@property
def n_bins(self):
"""Get number of bins, consistent with SparselyBin and Categorize """
return self.size
"""Get number of filled bins, consistent with SparselyBin and Categorize """
return len(self.bins)

def num_bins(self, low=None, high=None):
"""
Returns number of bins
Returns number of bins from low to high, including unfilled
Possible to set range with low and high params
Expand All @@ -629,7 +683,6 @@ def num_bins(self, low=None, high=None):
:returns: number of bins in range
:rtype: int
"""
import numpy as np
# sparse hist not filled
if self.minBin is None or self.maxBin is None:
return 0
Expand Down Expand Up @@ -672,7 +725,6 @@ def bin_edges(self, low=None, high=None):
:returns: numpy array with bin edges for selected range
:rtype: numpy.array
"""
import numpy as np
# sparse hist not filled
if self.minBin is None or self.maxBin is None:
return np.array([self.origin, self.origin + 1])
Expand Down Expand Up @@ -715,7 +767,6 @@ def bin_entries(self, low=None, high=None, xvalues=[]):
:returns: numpy array with numbers of entries for selected bins
:rtype: numpy.array
"""
import numpy as np
# sparse hist not filled
if self.minBin is None or self.maxBin is None:
return np.array([])
Expand Down Expand Up @@ -757,10 +808,8 @@ def bin_centers(self, low=None, high=None):
:returns: numpy array with bin centers for selected range
:rtype: numpy.array
"""
import numpy as np
bin_edges = self.bin_edges(low, high)
centers = [(bin_edges[i] + bin_edges[i + 1]) / 2. for i in range(len(bin_edges) - 1)]
return np.array(centers)
return (bin_edges[:-1] + bin_edges[1:]) / 2

@property
def mpv(self):
Expand Down
Loading

0 comments on commit dcbf220

Please sign in to comment.