Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/dei 105 feedback on multiple classification rule #103

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
af836d0
First step in implementing the validation of the parser_classificatio…
CindyvdVries May 2, 2024
09c1016
Update parser_classification_rule.py
CindyvdVries May 2, 2024
df3d1df
Merge branch 'feature/DEI-189-make-all-comparison-operators-available…
CindyvdVries May 7, 2024
0e9cbf7
Adding all logic to check operatiors
CindyvdVries May 7, 2024
4b9c6d2
Adding tests to check if all possibilities are covered
CindyvdVries May 7, 2024
9fd90fa
implement logic for all comparisons operators
CindyvdVries May 10, 2024
fac2250
round up logic for feedback on classification rule per parameter
CindyvdVries May 10, 2024
935b641
complete implementation for multiple variables + tests
CindyvdVries May 10, 2024
ded31e1
Merge branch 'main' into feature/DEI-105-feedback-on-multiple-classif…
CindyvdVries May 10, 2024
ccc84c9
last fixes, comments and cut off at 10 lines
CindyvdVries May 12, 2024
3974ef3
add tests
CindyvdVries May 12, 2024
9188248
correct test for parsing create classification rule. Ranges in tests …
mKlapwijk May 13, 2024
1690195
fix flake inspection error for comparing to none
mKlapwijk May 13, 2024
762c74a
fix flake errors on line lenght
mKlapwijk May 13, 2024
edb7292
add spaces to messages
mKlapwijk May 13, 2024
4c5efd5
correct typo in rule name
mKlapwijk May 13, 2024
c1693a0
fix tests
CindyvdVries May 14, 2024
b01ca65
change to a seperate log file. Add some spaces in messages
CindyvdVries May 14, 2024
3094f27
Feature/dei 105 simplified criteria validation (#104)
CindyvdVries May 17, 2024
bc52005
flake8..
CindyvdVries May 17, 2024
a0f3d12
Merge branch 'main' into feature/DEI-105-feedback-on-multiple-classif…
CindyvdVries May 17, 2024
2990d75
Update from feedback comments and remove print statements
CindyvdVries May 22, 2024
6e558c0
fix tests
CindyvdVries May 22, 2024
4a43198
Merge branch 'main' into feature/DEI-105-feedback-on-multiple-classif…
mKlapwijk May 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,4 @@ dmypy.json
!examples/input_yaml_files/*.yaml
/tests_acceptance/output_nc_files/*.nc
.idea/*
classification_warnings.log
12 changes: 6 additions & 6 deletions decoimpact/business/entities/rules/string_parser_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,10 @@ def type_of_classification(class_val) -> str:
if "<" in class_val:
read_str_comparison(class_val, "<")
return "smaller"
try:
float(class_val)
return "number"
except ValueError as exc:
raise ValueError(f"No valid criteria is given: {class_val}") from exc
print(class_val, float(class_val))

raise ValueError(f"No valid criteria is given: {class_val}")
try:
float(class_val)
return "number"
except Exception as exc:
raise ValueError(f"No valid criteria is given: {class_val}") from exc
279 changes: 278 additions & 1 deletion decoimpact/data/parsers/parser_classification_rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
"""
from typing import Any, Dict

import numpy as _np

from decoimpact.business.entities.rules.string_parser_utils import (
read_str_comparison,
str_range_to_list,
type_of_classification,
)
from decoimpact.crosscutting.i_logger import ILogger
from decoimpact.data.api.i_rule_data import IRuleData
from decoimpact.data.dictionary_utils import convert_table_element, get_dict_element
Expand All @@ -21,7 +28,6 @@


class ParserClassificationRule(IParserRuleBase):

"""Class for creating a ClassificationRuleData"""

@property
Expand All @@ -43,6 +49,7 @@ def parse_dict(self, dictionary: Dict[str, Any], logger: ILogger) -> IRuleData:
criteria_table = convert_table_element(criteria_table_list)

validate_table_with_input(criteria_table, input_variable_names)
self._validate_table_coverage(criteria_table, logger)

output_variable_name = get_dict_element("output_variable", dictionary)
description = get_dict_element("description", dictionary)
Expand All @@ -54,3 +61,273 @@ def parse_dict(self, dictionary: Dict[str, Any], logger: ILogger) -> IRuleData:
output_variable_name,
description,
)

def _validate_table_coverage(self, crit_table, logger: ILogger):
    """Warn the user about gaps and overlaps in the criteria table.

    Every column of the table except "output" holds the criteria of one
    parameter. The table is validated repeatedly while the right-most
    parameter is dropped each round: the last remaining parameter of a round
    is checked for coverage, once for every unique combination of values of
    the parameters to its left.

    Fewer than 6 messages are logged as one warning. With 6 or more, only the
    first 6 are logged, followed by a summary warning, and the complete list
    is written to "classification_warnings.log" in the working directory.
    Nothing is logged when no gaps or overlaps are found.

    Args:
        crit_table (dict): Criteria per column name, including the "output"
            column. The dictionary itself is not modified.
        logger (ILogger): Logger that receives the warnings.

    Raises:
        KeyError: If crit_table has no "output" column.
    """
    msgs = []
    criteria_table = crit_table.copy()
    del criteria_table["output"]

    def divide_table_in_unique_chunks(criteria_table, conditions=None, unique=True):
        """Recursively validate the last parameter of the given table.

        Args:
            criteria_table (dict): Criteria per parameter still to process.
            conditions (dict, optional): Values already fixed for the
                parameters to the left, used in the message prefix.
            unique (bool, optional): Drop duplicate criteria before the
                coverage check. Defaults to True.
        """
        conditions = {} if conditions is None else conditions

        # One parameter left: check its criteria for gaps and overlaps.
        if len(criteria_table) == 1:
            name, criteria = next(iter(criteria_table.items()))
            cond_str = ", ".join(
                f"{key}: {value}" for key, value in conditions.items()
            )
            if cond_str != "":
                # Only set when parameters to the left have been fixed.
                cond_str = f"For conditions: ({cond_str}). "
            if unique:
                # Rows that only differ in other parameters repeat the same
                # criterion here; those repeats are not overlaps.
                criteria = _np.unique(criteria)
            self._validate_criteria_on_overlap_and_gaps(
                name, criteria, msgs, cond_str, logger
            )
            return

        # Several parameters left: fix the first one to each of its unique
        # values and recurse on the matching rows of the other parameters.
        first_name = next(iter(criteria_table))
        # Convert once so the values compared below have the same dtype as
        # the values returned by unique(). A column mixing numbers and
        # strings is stringified by numpy; comparing those strings with the
        # raw list entries would never match the numeric entries.
        first_column = _np.asarray(criteria_table[first_name])
        for unique_c in _np.unique(first_column):
            indices = _np.flatnonzero(first_column == unique_c)
            new_crit_table = {
                k: _np.array(v)[indices]
                for k, v in criteria_table.items()
                if k != first_name
            }
            # A fresh dict per branch keeps sibling branches independent.
            divide_table_in_unique_chunks(
                new_crit_table, {**conditions, first_name: unique_c}
            )

    remaining_table = criteria_table.copy()

    # With a single parameter every row counts, so duplicates are reported as
    # overlap. With several parameters duplicates are expected and ignored.
    unique = len(remaining_table) != 1

    # Validate from right to left: each round checks the current last
    # parameter and then drops it.
    for key in reversed(list(criteria_table)):
        divide_table_in_unique_chunks(remaining_table, {}, unique)
        del remaining_table[key]

    if not msgs:
        # Full coverage: do not emit an empty warning.
        return

    max_msg = 6
    if len(msgs) < max_msg:
        logger.log_warning("\n".join(msgs))
        return

    logger.log_warning("\n".join(msgs[:max_msg]))
    logger.log_warning(
        f"{len(msgs)} warnings found concerning coverage of the "
        f"parameters. Only first {max_msg} warnings are shown. See "
        "classification_warnings.log file for all warnings."
    )
    # The context manager closes the file even when writing fails.
    with open("classification_warnings.log", "w", encoding="utf-8") as log_file:
        log_file.write("\n".join(msgs))

def _convert_to_ranges(self, val):
    """Convert an accepted criterion to range format.

    The criterion is classified once with type_of_classification and then
    converted based on that result.

    Args:
        val: Criterion to be converted (number, range or comparison).

    Returns:
        list: [start, end, bound_type] for the given criterion:
            number     -> [val, val, "equal"]
            range      -> [start, end, "equal"]
            > or >=    -> [val, inf, "larger"] or [val, inf, "larger_equal"]
            < or <=    -> [-inf, val, "smaller"] or [-inf, val, "smaller_equal"]
        Any other classification yields the two-element list [-inf, inf].
    """
    bound_type = type_of_classification(val)

    lower_bound_operators = {"larger_equal": ">=", "larger": ">"}
    upper_bound_operators = {"smaller_equal": "<=", "smaller": "<"}

    if bound_type in lower_bound_operators:
        operator = lower_bound_operators[bound_type]
        return [read_str_comparison(val, operator), float("inf"), bound_type]

    if bound_type in upper_bound_operators:
        operator = upper_bound_operators[bound_type]
        return [float("-inf"), read_str_comparison(val, operator), bound_type]

    if bound_type == "number":
        return [float(val), float(val), "equal"]

    if bound_type == "range":
        start_range, end_range = str_range_to_list(val)
        return [start_range, end_range, "equal"]

    # Remaining classifications are treated as covering the whole range.
    return [float("-inf"), float("inf")]

def _validate_criteria_on_overlap_and_gaps(
    self, name, criteria, msgs, pre_warn, logger: ILogger
):
    """Go over the given criteria to determine if there are gaps or overlaps.

    Args:
        name: Name of the parameter.
        criteria: The criteria (ranges, numbers or comparisons).
        msgs (list): List with all gathered warning messages; new messages
            are appended to it.
        pre_warn (str): Text that is prepended to every message, used to
            describe the parameter combination being checked.
        logger (ILogger): Not used by this method.

    Returns:
        list: The msgs list, extended with the warnings that were found.
    """
    neg_inf = float("-inf")
    pos_inf = float("inf")

    # The list of criteria is converted to a list of ranges.
    range_criteria = list(map(self._convert_to_ranges, criteria))
    if not range_criteria:
        # Nothing to compare; the checks below need at least one range.
        return msgs

    # Sort on "start" value and, for equal starts, on "end" value.
    # For example: [[1, 4], [0, 5], [-inf, 2], [-inf, 0]]
    # becomes:     [[-inf, 0], [-inf, 2], [0, 5], [1, 4]]
    sorted_range_criteria = sorted(range_criteria, key=lambda x: (x[0], x[1]))

    # Multiple smaller (<) or smaller-or-equal (<=) comparisons always
    # overlap. Report it once and keep only the one reaching furthest.
    smaller = [
        i
        for i, c in enumerate(sorted_range_criteria)
        if c[0] == neg_inf and c[1] != pos_inf
    ]
    if len(smaller) > 1:
        msgs.append(
            f"{pre_warn}Overlap for variable {name}, multiple criteria with "
            "operators < or <= are defined"
        )
        # Delete from the back so the remaining indices stay valid.
        for i in reversed(smaller[:-1]):
            del sorted_range_criteria[i]

    # Multiple larger (>) or larger-or-equal (>=) comparisons always
    # overlap. Report it once and keep only the one starting lowest.
    larger = [
        i
        for i, c in enumerate(sorted_range_criteria)
        if c[1] == pos_inf and c[0] != neg_inf
    ]
    if len(larger) > 1:
        msgs.append(
            f"{pre_warn}Overlap for variable {name}, multiple criteria with "
            "operators > or >= are defined"
        )
        # Delete from the back as well: deleting in ascending order shifts
        # the later indices, which removes the wrong entries or raises an
        # IndexError when three or more such criteria are given.
        for i in reversed(larger[1:]):
            del sorted_range_criteria[i]

    # Everything below the first range is not covered.
    first_crit = sorted_range_criteria[0]
    if first_crit[0] != neg_inf:
        msgs = self._warn_message(
            name, msgs, pre_warn, neg_inf, first_crit[0], "Gap"
        )

    # Compare every range with the range before it.
    for prev_c, crit in zip(sorted_range_criteria, sorted_range_criteria[1:]):
        begin_inside = self._check_inside_bounds(
            prev_c[0], prev_c[1], crit[0], op_prev=prev_c[-1], op_cur=crit[-1]
        )
        end_inside = self._check_inside_bounds(
            prev_c[0], prev_c[1], crit[1], op_prev=prev_c[-1], op_cur=crit[-1]
        )

        # When two ranges touch and exactly one of them includes the shared
        # value (for example "<5" followed by ">=5") there is neither an
        # overlap nor a gap. In that case this flag is False.
        cur_inclusive = "equal" in str(crit[-1])
        prev_inclusive = "equal" in str(prev_c[-1])
        touching = crit[0] == prev_c[1]
        non_equal_overlap = not ((cur_inclusive != prev_inclusive) and touching)

        if begin_inside and end_inside:
            # The range is inside the previous range, e.g. 0:10 and 3:5,
            # giving one overlap.
            msgs = self._warn_message(name, msgs, pre_warn, crit[0], crit[1])
            # Continue with the end of the enclosing range so the next
            # comparison is made against the full covered stretch.
            crit[1] = prev_c[1]

        elif begin_inside and not end_inside and non_equal_overlap:
            # The range starts within the previous range, e.g. 0:10 and
            # 3:15, so they overlap from the start to the previous end.
            msgs = self._warn_message(name, msgs, pre_warn, crit[0], prev_c[1])

        # Because the list is sorted, "not begin_inside and end_inside"
        # cannot occur.

        elif not begin_inside and not end_inside and non_equal_overlap:
            # The range is completely outside the previous range, e.g. 0:10
            # and 15:20, leaving a gap in between.
            msgs = self._warn_message(
                name, msgs, pre_warn, prev_c[1], crit[0], "Gap"
            )

    # Everything above the highest end value is not covered.
    if sorted_range_criteria[-1][1] != pos_inf:
        msgs = self._warn_message(
            name,
            msgs,
            pre_warn,
            max(list_c[1] for list_c in sorted_range_criteria),
            pos_inf,
            "Gap",
        )

    return msgs

def _check_inside_bounds(self, start, end, var, op_prev=None, op_cur=None):
# Check wether the next value falls wihtin the bounds of the previous range.
# Some exceptions on > and < defined values.
if op_cur == "larger":
left = var > start
else:
left = var >= start

if op_prev == "smaller":
right = var < end
else:
right = var <= end
return left & right

def _warn_message(self, name, msgs, pre_warn, start, end=None, type_warn="Overlap"):
# Create a warning message (default overlap) for given values
comp_str = f"range {start}:{end}"
if (start == end) or (end is None):
comp_str = f"number {start}"
msgs.append(f"{pre_warn}{type_warn} for variable {name} in {comp_str}")
return msgs
Loading
Loading