Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(backend/datadog): use datadog-api-client-python rather than data… #528

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,10 @@ prometheus =
datadog =
datadog
retrying==1.3.4
datadog_api_client
dynatrace =
requests
retrying==1.3.4
bigquery =
google-api-python-client
google-cloud-bigquery
Expand Down
150 changes: 96 additions & 54 deletions slo_generator/backends/datadog.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,42 +17,63 @@
"""

import logging
import os
import pprint

import datadog

from slo_generator import utils

LOGGER = logging.getLogger(__name__)
logging.getLogger("datadog.api").setLevel(logging.ERROR)
from datadog_api_client.v1 import ApiClient, ApiException, Configuration
from datadog_api_client.v1.api.authentication_api import AuthenticationApi
from datadog_api_client.v1.api.metrics_api import MetricsApi
from datadog_api_client.v1.api.service_level_objectives_api import (
ServiceLevelObjectivesApi,
)

# Configure logging
logging.basicConfig(level=os.environ.get("LOGLEVEL", "ERROR").upper(), force=True)
logger = logging.getLogger(__name__)


class DatadogClient:
    """Thin wrapper around the official ``datadog_api_client`` package.

    Builds a retry-enabled ``ApiClient``, validates the supplied credentials
    eagerly, and exposes ready-to-use SLO and Metrics API endpoints.

    Args:
        api_key (str): Datadog API key.
        app_key (str): Datadog application key.
        api_host (str): Datadog API host (site), e.g. ``https://api.datadoghq.eu``.
        kwargs (dict): Extra arguments forwarded to ``Configuration``.
    """

    def __init__(self, api_key=None, app_key=None, api_host=None, **kwargs):
        config = Configuration(
            host=api_host,
            enable_retry=True,
            retry_backoff_factor=2,
            max_retries=5,
            **kwargs,
        )
        # Both keys are required by the v1 API: 'apiKeyAuth' authenticates
        # the account, 'appKeyAuth' scopes access to the application.
        config.api_key["apiKeyAuth"] = api_key
        config.api_key["appKeyAuth"] = app_key
        self.api_client = ApiClient(config)
        # Fail fast on invalid credentials (network call at construction time).
        AuthenticationApi(self.api_client).validate()
        self.slo_api_client = ServiceLevelObjectivesApi(self.api_client)
        self.metrics_api_client = MetricsApi(self.api_client)


class DatadogBackend:
    """Backend for querying metrics from Datadog.

    Args:
        client (obj, optional): Existing Datadog client to pass.
        api_key (str): Datadog API key.
        app_key (str): Datadog APP key.
        api_host (str): Datadog site (e.g. ``https://api.datadoghq.eu``).
        kwargs (dict): Extra arguments to pass to initialize function.
    """

    def __init__(
        self, client=None, api_key=None, app_key=None, api_host=None, **kwargs
    ):
        self.client = client
        if not self.client:
            # Build a client from credentials when none was injected.
            self.client = DatadogClient(
                api_key=api_key, app_key=app_key, api_host=api_host, **kwargs
            )

def good_bad_ratio(self, timestamp, window, slo_config):
"""Query SLI value from good and valid queries.

Args:
timestamp (int): UNIX timestamp.
window (int): Window (in seconds).
slo_config (dict): SLO configuration.

Returns:
tuple: Good event count, Bad event count.
"""
Expand All @@ -77,9 +98,9 @@ def good_bad_ratio(self, timestamp, window, slo_config):
operator_suffix,
)

good_event_query = self.client.Metric.query(
start=start,
end=end,
good_event_query = self.client.metrics_api_client.query_metrics(
_from=int(start),
to=int(end),
query=query_good,
)

Expand All @@ -90,9 +111,9 @@ def good_bad_ratio(self, timestamp, window, slo_config):
operator_suffix,
)

event_query = self.client.Metric.query(
start=start,
end=end,
event_query = self.client.metrics_api_client.query_metrics(
_from=int(start),
to=int(end),
query=query,
)

Expand All @@ -101,18 +122,18 @@ def good_bad_ratio(self, timestamp, window, slo_config):
if measurement.get("query_valid"):
event_count = event_count - good_event_count

LOGGER.debug(f"Good events: {good_event_count} | " f"Bad events: {event_count}")
logging.debug(
f"Good events: {good_event_count} | " f"Bad events: {event_count}"
)

return good_event_count, event_count

def query_sli(self, timestamp, window, slo_config):
"""Query SLI value directly.

Args:
timestamp (int): UNIX timestamp.
window (int): Window (in seconds).
slo_config (dict): SLO configuration.

Returns:
float: SLI value.
"""
Expand All @@ -121,59 +142,82 @@ def query_sli(self, timestamp, window, slo_config):
end = timestamp
query = measurement["query"]
query = self._fmt_query(query, window)
response = self.client.Metric.query(start=start, end=end, query=query)
LOGGER.debug(f"Result valid: {pprint.pformat(response)}")
response = self.client.metrics_api_client.query_metrics(
_from=int(start), to=int(end), query=query
)
logging.debug(f"Result valid: {pprint.pformat(response)}")
return DatadogBackend.count(response, average=True)

def query_slo(self, timestamp, window, slo_config):
"""Query SLO value from a given Datadog SLO.

Args:
timestamp (int): UNIX timestamp.
window (int): Window (in seconds).
slo_config (dict): SLO configuration.

Returns:
tuple: Good event count, bad event count.
"""
slo_id = slo_config["spec"]["service_level_indicator"]["slo_id"]
from_ts = timestamp - window
if utils.is_debug_enabled():
slo_data = self.client.ServiceLevelObjective.get(id=slo_id)
LOGGER.debug(f"SLO data: {slo_id} | Result: {pprint.pformat(slo_data)}")
data = self.client.ServiceLevelObjective.history(
id=slo_id,
from_ts=from_ts,
to_ts=timestamp,
)

try:
LOGGER.debug(f"Timeseries data: {slo_id} | Result: {pprint.pformat(data)}")
good_event_count = data["data"]["series"]["numerator"]["sum"]
valid_event_count = data["data"]["series"]["denominator"]["sum"]
bad_event_count = valid_event_count - good_event_count
return (good_event_count, bad_event_count)
except KeyError as exception: # monitor-based SLI
sli_value = data["data"]["overall"]["sli_value"] / 100
LOGGER.debug(exception)
return sli_value
# Retrieve the SLO history
data = self.client.slo_api_client.get_slo_history(
slo_id, from_ts=int(from_ts), to_ts=int(timestamp)
)
logging.info(f"SLO history: {data}")
except ApiException as e:
logging.error(f"Error retrieving SLO history: {e}")
return None, None

# Check if the data is present and properly structured
try:
logging.debug(f"Timeseries data: {slo_id} | Result: {pprint.pformat(data)}")

# Check if necessary keys exist before accessing them
good_event_count = (
data.get("data", {})
.get("series", {})
.get("numerator", {})
.get("sum", 0)
)
valid_event_count = (
data.get("data", {})
.get("series", {})
.get("denominator", {})
.get("sum", 0)
)

if good_event_count is not None and valid_event_count is not None:
bad_event_count = valid_event_count - good_event_count
return good_event_count, bad_event_count

except KeyError as exception: # Monitor-based SLI case
logging.debug(f"KeyError exception: {exception}")
# Retrieve the SLI value if it's a monitor-based SLI
sli_value = (
data.get("data", {}).get("overall", {}).get("sli_value", 0) / 100
)
return (
sli_value,
None,
) # Return None for bad_event_count if it's not a standard SLO

# If the data is invalid or there's an issue, return None for both counts
return None, None

@staticmethod
def _fmt_query(query, window, operator=None, operator_suffix=None):
"""Format Datadog query:

* If the Datadog expression has a `[window]` placeholder, replace it by
the current window. Otherwise, append it to the expression.

* If prefix / suffix operators are defined, apply them to the metric.

* If labels are defined, append them to existing labels.

Args:
query (str): Original query in YAML config.
window (int): Query window (in seconds).
operator (str): Operator (e.g: sum, avg, median, ...)
operator_suffix (str): Operator suffix (e.g: as_count(), ...)

Returns:
str: Formatted query.
"""
Expand All @@ -184,25 +228,23 @@ def _fmt_query(query, window, operator=None, operator_suffix=None):
query = query.replace("[window]", f"{window}")
if operator_suffix:
query = f"{query}.{operator_suffix}"
LOGGER.debug(f"Query: {query}")
logging.debug(f"Query: {query}")
return query

@staticmethod
def count(response, average=False):
"""Count events in time series.

Args:
response (dict): Datadog Metrics API response.
average (bool): Take average of result.

Returns:
int: Event count.
"""
try:
values = []
pointlist = response["series"][0]["pointlist"]
for point in pointlist:
value = point[1]
value = point["value"][1]
if value is None:
continue
values.append(value)
Expand All @@ -212,5 +254,5 @@ def count(response, average=False):
return sum(values) / len(values)
return sum(values)
except (IndexError, AttributeError) as exception:
LOGGER.debug(exception)
logging.debug(exception)
return 0 # no events in timeseries
Loading