feat(Backend): Add Opensearch as a backend provider

google · Aug 29, 2023 · 5aed69a · 5aed69a
1 parent 5171318
commit 5aed69a
Show file tree

Hide file tree

Showing 13 changed files with 447 additions and 3 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -22,6 +22,6 @@ RUN apt-get update && \
 ADD . /app
 WORKDIR /app
 RUN pip install -U setuptools
-RUN pip install ."[api, datadog, dynatrace, prometheus, elasticsearch, splunk, pubsub, cloud_monitoring, cloud_service_monitoring, cloud_storage, bigquery, cloudevent, dev]"
+RUN pip install ."[api, datadog, dynatrace, prometheus, elasticsearch, opensearch, splunk, pubsub, cloud_monitoring, cloud_service_monitoring, cloud_storage, bigquery, cloudevent, dev]"
 ENTRYPOINT [ "slo-generator" ]
 CMD ["-v"]
diff --git a/Makefile b/Makefile
@@ -58,7 +58,7 @@ develop: install
 	pre-commit install
 
 install: clean
-	$(PIP) install -e ."[api, datadog, prometheus, elasticsearch, splunk, pubsub, cloud_monitoring, bigquery, dev]"
+	$(PIP) install -e ."[api, datadog, prometheus, elasticsearch, opensearch, splunk, pubsub, cloud_monitoring, bigquery, dev]"
 
 uninstall: clean
 	$(PIP) freeze --exclude-editable | xargs $(PIP) uninstall -y
@@ -102,7 +102,7 @@ bandit:
 safety:
 	safety check
 
-integration: int_cm int_csm int_custom int_dd int_dt int_es int_prom int_sp
+integration: int_cm int_csm int_custom int_dd int_dt int_es int_prom int_sp int_os
 
 int_cm:
 	slo-generator compute -f samples/cloud_monitoring -c samples/config.yaml
@@ -128,6 +128,9 @@ int_prom:
 int_sp:
 	slo-generator compute -f samples/splunk -c samples/config.yaml
 
+int_os:
+	slo-generator compute -f samples/opensearch -c samples/config.yaml
+
 # Run API locally
 run_api:
 	slo-generator api --target=run_compute --signature-type=http -c samples/config.yaml

diff --git a/docs/providers/opensearch.md b/docs/providers/opensearch.md
@@ -0,0 +1,97 @@
+# OpenSearch
+
+## Backend
+
+Using the `opensearch` backend class, you can query any metrics available in Opensearch to create an SLO.
+
+```yaml
+backends:
+  opensearch:
+    url: ${OPENSEARCH_URL}
+```
+
+Note that `url` can be either a single string (when connecting to a single node) or a list of strings (when connecting to multiple nodes):
+
+```yaml
+backends:
+  opensearch:
+    url: https://localhost:9200
+```
+
+```yaml
+backends:
+  opensearch:
+    url:
+      - https://localhost:9200
+      - https://localhost:9201
+```
+
+The following method is available to compute SLOs with the `opensearch` backend:
+
+* `good_bad_ratio` method is used to compute the ratio between two metrics:
+
+* **Good events**, i.e. events we consider as 'good' from the user perspective.
+* **Bad or valid events**, i.e. events we consider either as 'bad' from the user perspective, or all events we consider as 'valid' for the computation of the SLO.
+
+This method is often used for availability SLOs, but can be used for other purposes as well (see examples).
+
+**SLO example:**
+
+```yaml
+  backend: opensearch
+  method: good_bad_ratio
+  service_level_indicator:
+    index: my-index
+    date_field: '@timestamp'
+    query_good:
+      must:
+        range:
+          api-response-time:
+            lt: 350
+    query_bad:
+      must:
+        range:
+          api-response-time:
+            gte: 350
+```
+
+Additional info:
+
+* `date_field`: Has to be a valid Opensearch `timestamp` type
+
+**&rightarrow; [Full SLO config](../../samples/opensearch/slo_opensearch_latency_sli.yaml)**
+
+You can also use the `query_valid` field, which identifies all valid events, instead of the `query_bad` field, which identifies only bad events.
+
+The query clauses entered in the `query_good`, `query_bad` or `query_valid` fields are wrapped in a `bool` query, to which a `filter` on `date_field` is added so that results are restricted to the `window` specified in your Error Budget Policy steps.
+
+The full `OpenSearch` query body for the `query_bad` above will therefore look like:
+
+```json
+{
+  "query": {
+    "bool": {
+      "must": {
+        "range": {
+          "api-response-time": {
+            "gte": 350
+          }
+        }
+      },
+      "filter": {
+        "range": {
+          "@timestamp": {
+            "gte": "now-3600s/s",
+            "lt": "now/s"
+          }
+        }
+      }
+    }
+  },
+  "track_total_hits": true
+}
+```
+
+### Examples
+
+Complete SLO samples using the `opensearch` backend are available in [samples/opensearch](../../samples/opensearch). Check them out!
diff --git a/samples/.env.sample b/samples/.env.sample
@@ -21,3 +21,4 @@ export DYNATRACE_API_TOKEN=
 export BIGQUERY_PROJECT_ID=
 export BIGQUERY_DATASET_ID=
 export BIGQUERY_TABLE_ID=
+export OPENSEARCH_URL=
diff --git a/samples/config.yaml b/samples/config.yaml
@@ -24,6 +24,8 @@ backends:
     port: ${SPLUNK_PORT}
     user: ${SPLUNK_USER}
     password: ${SPLUNK_PWD}
+  opensearch:
+    url: ${OPENSEARCH_URL}
 
 exporters:
   cloudevent:

diff --git a/samples/opensearch/slo_opensearch_availability_sli.yaml b/samples/opensearch/slo_opensearch_availability_sli.yaml
@@ -0,0 +1,25 @@
+apiVersion: sre.google.com/v2
+kind: ServiceLevelObjective
+metadata:
+  name: open-search-availability
+  labels:
+    service_name: opensearch
+    feature_name: opensearch-availability
+    slo_name: availability
+spec:
+  description: 99% of the elements are valid
+  backend: opensearch
+  method: good_bad_ratio
+  exporters: []
+  service_level_indicator:
+    index: my-index
+    date_field: '@timestamp'
+    query_good:
+      must:
+        term:
+          status: 200
+    query_bad:
+      must_not:
+        term:
+          status: 200
+  goal: 0.99
diff --git a/samples/opensearch/slo_opensearch_latency_sli.yaml b/samples/opensearch/slo_opensearch_latency_sli.yaml
@@ -0,0 +1,27 @@
+apiVersion: sre.google.com/v2
+kind: ServiceLevelObjective
+metadata:
+  name: open-search-latency
+  labels:
+    service_name: opensearch
+    feature_name: opensearch-latency
+    slo_name: latency
+spec:
+  description: 99% of the elements have an api-response-time below 350
+  backend: opensearch
+  method: good_bad_ratio
+  exporters: []
+  service_level_indicator:
+    index: my-index
+    date_field: '@timestamp'
+    query_good:
+      must:
+        range:
+          api-response-time:
+            lt: 350
+    query_bad:
+      must:
+        range:
+          api-response-time:
+            gte: 350
+  goal: 0.99
diff --git a/setup.cfg b/setup.cfg
@@ -91,6 +91,8 @@ cloud_storage =
     google-cloud-storage
 elasticsearch =
     elasticsearch
+opensearch =
+    opensearch-py
 splunk =
     splunk-sdk
 pubsub =

diff --git a/slo_generator/backends/opensearch.py b/slo_generator/backends/opensearch.py
@@ -0,0 +1,143 @@
+"""
+`opensearch.py`
+Opensearch backend implementation.
+"""
+
+import copy
+import logging
+
+from opensearchpy import OpenSearch
+
+from slo_generator.constants import NO_DATA
+
+LOGGER = logging.getLogger(__name__)
+
+
+class OpensearchBackend:
+    """Backend for querying metrics from OpenSearch.
+
+    Args:
+        client(opensearchpy.OpenSearch): Existing OS client. When omitted, a
+            client is built from `os_config`.
+        os_config(dict): OS client configuration. `url` (a string or a list
+            of strings) is handed to the client as `hosts`; `basic_auth`
+            (`username`/`password`) and `api_key` (`id`/`value`) mappings are
+            converted to tuples; any other key is forwarded unchanged.
+    """
+
+    def __init__(self, client=None, **os_config):
+        self.client = client
+        if self.client is None:
+            # Copy first so the caller's configuration is never modified.
+            conf = copy.deepcopy(os_config)
+            url = conf.pop("url", None)
+            basic_auth = conf.pop("basic_auth", None)
+            api_key = conf.pop("api_key", None)
+            if url:
+                conf["hosts"] = url
+            if basic_auth:
+                # opensearch-py documents `http_auth` as the option carrying
+                # basic credentials; the config key stays `basic_auth`.
+                conf["http_auth"] = (basic_auth["username"], basic_auth["password"])
+            if api_key:
+                conf["api_key"] = (api_key["id"], api_key["value"])
+
+            self.client = OpenSearch(**conf)
+
+    # pylint: disable=unused-argument
+    def good_bad_ratio(self, timestamp, window, slo_config):
+        """Count 'good' and 'bad' events over the given window.
+
+        Args:
+            timestamp(int): UNIX timestamp (unused, the queries built here
+                are relative to `now`).
+            window(int): Window size (in seconds).
+            slo_config(dict): SLO configuration.
+              spec:
+                method: "good_bad_ratio"
+                service_level_indicator:
+                  index(str): the index to query
+                  date_field(str): the field the time window applies to
+                  query_good(dict): `bool` clauses matching good events
+                  query_bad(dict): `bool` clauses matching bad events
+                  query_valid(dict): `bool` clauses matching valid events
+
+        Returns:
+            tuple: good_event_count, bad_event_count
+
+        Raises:
+            ValueError: if neither `query_bad` nor `query_valid` is set.
+        """
+        measurement = slo_config["spec"]["service_level_indicator"]
+        index = measurement["index"]
+        query_good = measurement["query_good"]
+        query_bad = measurement.get("query_bad")
+        query_valid = measurement.get("query_valid")
+        date_field = measurement.get("date_field")
+
+        # Reject an incomplete configuration before sending any request.
+        if query_bad is None and query_valid is None:
+            raise ValueError("`query_bad` or `query_valid` is required.")
+
+        good = self.build_query(query_good, window, date_field)
+        good_events_count = self.count(self.query(index, good))
+
+        # `query_bad` takes precedence over `query_valid` when both are set.
+        if query_bad is not None:
+            bad = self.build_query(query_bad, window, date_field)
+            return good_events_count, self.count(self.query(index, bad))
+
+        valid = self.build_query(query_valid, window, date_field)
+        valid_events_count = self.count(self.query(index, valid))
+        if NO_DATA in (good_events_count, valid_events_count):
+            # A missing count cannot be subtracted meaningfully.
+            return good_events_count, NO_DATA
+        return good_events_count, valid_events_count - good_events_count
+
+    def query(self, index, body):
+        """Query Opensearch server.
+
+        Args:
+            index(str): Index to query.
+            body(dict): Query body.
+
+        Returns:
+            dict: Response.
+        """
+        return self.client.search(index=index, body=body)
+
+    @staticmethod
+    def count(response):
+        """Count events in an Opensearch response.
+
+        Args:
+            response(dict): Opensearch query response.
+
+        Returns:
+            int: Event count, or `NO_DATA` when the response does not have
+                the `hits.total.value` entry.
+        """
+        try:
+            return response["hits"]["total"]["value"]
+        except (KeyError, TypeError) as exception:
+            # TypeError covers a `None` response or a non-mapping `total`.
+            LOGGER.warning("Couldn't find any values in timeseries response")
+            LOGGER.debug(exception, exc_info=True)
+            return NO_DATA
+
+    @staticmethod
+    def build_query(query, window, date_field):
+        """Build Opensearch query.
+
+        Wrap the given clauses in a `bool` query and add a `filter` limiting
+        `date_field` to the last `window` seconds. The input is left
+        untouched, so the same SLO config can be used again with another
+        window.
+
+        Args:
+            query(dict): Existing `bool` clauses.
+            window(int): Window in seconds.
+            date_field(str): Field to filter time on
+
+        Returns:
+            dict: Query body with range clause added, or None when `query`
+                is None.
+        """
+        if query is None:
+            return None
+        # Work on a copy: `query` comes straight from the SLO config.
+        bool_query = copy.deepcopy(query)
+        window_filter = {
+            "range": {
+                f"{date_field}": {
+                    "gte": f"now-{window}s/s",
+                    "lt": "now/s",
+                }
+            }
+        }
+
+        existing = bool_query.get("filter")
+        if not existing:
+            bool_query["filter"] = window_filter
+        elif isinstance(existing, list):
+            bool_query["filter"] = list(existing) + [window_filter]
+        else:
+            # Keep the user's own filter clause next to the window filter
+            # instead of merging both into one (invalid) clause object.
+            bool_query["filter"] = [existing, window_filter]
+
+        return {"query": {"bool": bool_query}, "track_total_hits": True}
+
+
+OS = OpensearchBackend