From 41c14707ad713cd78e8264e4b6a46d0b32320dc0 Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Fri, 16 Aug 2024 19:32:47 +0000 Subject: [PATCH] Update otelcontribcol to 0.106.0-gke.2 This change updates otelcontribcol to the latest version and adapts the configuration to the breaking changes introduced in 0.104.0 and 0.106.0. Breaking change in 0.104.0 https://github.com/open-telemetry/opentelemetry-collector-contrib/releases/tag/v0.104.0 Breaking change in 0.106.0 https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 - Localhost is now the default host for receiver endpoints, while otel-agent and otel-collector require 0.0.0.0, so the `component.UseLocalHostAsDefaultHost` feature gate is explicitly disabled via `--feature-gates=-component.UseLocalHostAsDefaultHost`. - The format of the environment variable references was updated to meet the new syntax requirements (e.g. `$CONFIGSYNC_SYNC_KIND` is now `${CONFIGSYNC_SYNC_KIND}`). The otel-agent ConfigMap was split between the reconciler and controllers, ensuring that sync-related labels are only applied to reconcilers. - A `no_op_label` has been added to ensure that the aggregation in the metricstransform processor filters on all metric labels. This is a temporary workaround until a permanent fix is implemented upstream.
--- Makefile | 2 +- manifests/base/kustomization.yaml | 1 + manifests/otel-agent-cm.yaml | 20 +----- manifests/otel-agent-reconciler-cm.yaml | 69 +++++++++++++++++++ manifests/templates/otel-collector.yaml | 1 + .../reconciler-manager-configmap.yaml | 9 +-- manifests/templates/reconciler-manager.yaml | 1 + .../templates/resourcegroup-manifest.yaml | 1 + pkg/metrics/otel.go | 20 ++++-- .../controllers/otel_controller_test.go | 2 +- 10 files changed, 96 insertions(+), 30 deletions(-) create mode 100644 manifests/otel-agent-reconciler-cm.yaml diff --git a/Makefile b/Makefile index cd64568504..1ff6c33904 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,7 @@ HELM_STAGING_DIR := $(OUTPUT_DIR)/third_party/helm GIT_SYNC_VERSION := v4.2.3-gke.5__linux_amd64 GIT_SYNC_IMAGE_NAME := gcr.io/config-management-release/git-sync:$(GIT_SYNC_VERSION) -OTELCONTRIBCOL_VERSION := v0.103.0-gke.3 +OTELCONTRIBCOL_VERSION := v0.106.0-gke.2 OTELCONTRIBCOL_IMAGE_NAME := gcr.io/config-management-release/otelcontribcol:$(OTELCONTRIBCOL_VERSION) # Directory used for staging Docker contexts. diff --git a/manifests/base/kustomization.yaml b/manifests/base/kustomization.yaml index 1897fe6574..8e6f4bfbb0 100644 --- a/manifests/base/kustomization.yaml +++ b/manifests/base/kustomization.yaml @@ -22,6 +22,7 @@ resources: - ../ns-reconciler-base-cluster-role.yaml - ../root-reconciler-base-cluster-role.yaml - ../otel-agent-cm.yaml +- ../otel-agent-reconciler-cm.yaml - ../reconciler-manager-service-account.yaml - ../reposync-crd.yaml - ../rootsync-crd.yaml diff --git a/manifests/otel-agent-cm.yaml b/manifests/otel-agent-cm.yaml index db8093e3ae..b1173ffe6b 100644 --- a/manifests/otel-agent-cm.yaml +++ b/manifests/otel-agent-cm.yaml @@ -32,24 +32,6 @@ data: tls: insecure: true processors: - # Attributes processor adds custom configsync metric labels to applicable - # metrics to identify the sync object used to configure this deployment. 
- # - # Note: configsync.sync.generation is explicitly excluded here, because it - # is high cardinality. So we don't want to send it as a label, only as a - # resource attribute. That way it's only propagated to Prometheus, and not - # Monarch or Cloud Monitoring, which ignore custom resource attributes. - attributes: - actions: - - key: configsync.sync.kind - action: upsert - value: $CONFIGSYNC_SYNC_KIND - - key: configsync.sync.name - action: upsert - value: $CONFIGSYNC_SYNC_NAME - - key: configsync.sync.namespace - action: upsert - value: $CONFIGSYNC_SYNC_NAMESPACE batch: # Populate resource attributes from OTEL_RESOURCE_ATTRIBUTES env var and # the GCE metadata service, if available. @@ -62,7 +44,7 @@ data: pipelines: metrics: receivers: [opencensus] - processors: [batch, resourcedetection, attributes] + processors: [batch, resourcedetection] exporters: [opencensus] telemetry: logs: diff --git a/manifests/otel-agent-reconciler-cm.yaml b/manifests/otel-agent-reconciler-cm.yaml new file mode 100644 index 0000000000..d8b437e85c --- /dev/null +++ b/manifests/otel-agent-reconciler-cm.yaml @@ -0,0 +1,69 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-agent-reconciler + namespace: config-management-system + labels: + app: opentelemetry + component: otel-agent + configmanagement.gke.io/system: "true" + configmanagement.gke.io/arch: "csmr" +data: + otel-agent-reconciler-config.yaml: | + receivers: + opencensus: + exporters: + opencensus: + endpoint: otel-collector.config-management-monitoring:55678 + tls: + insecure: true + processors: + # Attributes processor adds custom configsync metric labels to applicable + # metrics to identify the sync object used to configure this deployment. + # + # Note: configsync.sync.generation is explicitly excluded here, because it + # is high cardinality. So we don't want to send it as a label, only as a + # resource attribute. That way it's only propagated to Prometheus, and not + # Monarch or Cloud Monitoring, which ignore custom resource attributes. + attributes: + actions: + - key: configsync.sync.kind + action: upsert + value: ${CONFIGSYNC_SYNC_KIND} + - key: configsync.sync.name + action: upsert + value: ${CONFIGSYNC_SYNC_NAME} + - key: configsync.sync.namespace + action: upsert + value: ${CONFIGSYNC_SYNC_NAMESPACE} + batch: + # Populate resource attributes from OTEL_RESOURCE_ATTRIBUTES env var and + # the GCE metadata service, if available. 
+ resourcedetection: + detectors: [env, gcp] + extensions: + health_check: + service: + extensions: [health_check] + pipelines: + metrics: + receivers: [opencensus] + processors: [batch, resourcedetection, attributes] + exporters: [opencensus] + telemetry: + logs: + level: "INFO" diff --git a/manifests/templates/otel-collector.yaml b/manifests/templates/otel-collector.yaml index dfdcf6a830..933fcbe84f 100644 --- a/manifests/templates/otel-collector.yaml +++ b/manifests/templates/otel-collector.yaml @@ -101,6 +101,7 @@ spec: # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. - "--feature-gates=-pkg.translator.prometheus.NormalizeName" + - "--feature-gates=-component.UseLocalHostAsDefaultHost" resources: limits: cpu: 1 diff --git a/manifests/templates/reconciler-manager-configmap.yaml b/manifests/templates/reconciler-manager-configmap.yaml index 7327d691cc..410ff1b6a1 100644 --- a/manifests/templates/reconciler-manager-configmap.yaml +++ b/manifests/templates/reconciler-manager-configmap.yaml @@ -168,10 +168,11 @@ data: command: - /otelcontribcol args: - - "--config=/conf/otel-agent-config.yaml" + - "--config=/conf/otel-agent-reconciler-config.yaml" # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. - "--feature-gates=-pkg.translator.prometheus.NormalizeName" + - "--feature-gates=-component.UseLocalHostAsDefaultHost" securityContext: allowPrivilegeEscalation: false readOnlyRootFilesystem: true @@ -184,7 +185,7 @@ data: - containerPort: 8888 # Metrics. 
protocol: TCP volumeMounts: - - name: otel-agent-config-vol + - name: otel-agent-config-reconciler-vol mountPath: /conf readinessProbe: httpGet: @@ -273,9 +274,9 @@ data: secret: secretName: git-creds defaultMode: 288 - - name: otel-agent-config-vol + - name: otel-agent-config-reconciler-vol configMap: - name: otel-agent + name: otel-agent-reconciler defaultMode: 420 - name: service-account emptyDir: {} diff --git a/manifests/templates/reconciler-manager.yaml b/manifests/templates/reconciler-manager.yaml index d280ae2401..352eaab34b 100644 --- a/manifests/templates/reconciler-manager.yaml +++ b/manifests/templates/reconciler-manager.yaml @@ -71,6 +71,7 @@ spec: # The prometheus transformer appends `_ratio` to gauge metrics: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/v0.86.0/pkg/translator/prometheus/normalize_name.go#L149 # Add the feature gate to enable metric suffix trimming. - "--feature-gates=-pkg.translator.prometheus.NormalizeName" + - "--feature-gates=-component.UseLocalHostAsDefaultHost" resources: limits: cpu: 1 diff --git a/manifests/templates/resourcegroup-manifest.yaml b/manifests/templates/resourcegroup-manifest.yaml index c4c34301e7..67e074f38c 100644 --- a/manifests/templates/resourcegroup-manifest.yaml +++ b/manifests/templates/resourcegroup-manifest.yaml @@ -232,6 +232,7 @@ spec: - args: - --config=/conf/otel-agent-config.yaml - --feature-gates=-pkg.translator.prometheus.NormalizeName + - --feature-gates=-component.UseLocalHostAsDefaultHost command: - /otelcontribcol env: diff --git a/pkg/metrics/otel.go b/pkg/metrics/otel.go index a3808457e7..5b75eb06ca 100644 --- a/pkg/metrics/otel.go +++ b/pkg/metrics/otel.go @@ -184,7 +184,9 @@ processors: new_name: current_declared_resources operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: 
max - include: kcc_resource_count action: update @@ -255,14 +257,18 @@ processors: new_name: resource_conflicts_count operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: max - include: internal_errors_total action: update new_name: internal_errors_count operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: max - include: remediate_duration_seconds action: update @@ -322,13 +328,17 @@ processors: action: update operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: max - include: kustomize_build_latency action: update operations: - action: aggregate_labels - label_set: [] + # Using a no_op_label to get around issue in the upstream + # https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/34430 + label_set: [no_op_label] aggregation_type: max extensions: health_check: diff --git a/pkg/reconcilermanager/controllers/otel_controller_test.go b/pkg/reconcilermanager/controllers/otel_controller_test.go index 693f4c8aef..acef7e4cc1 100644 --- a/pkg/reconcilermanager/controllers/otel_controller_test.go +++ b/pkg/reconcilermanager/controllers/otel_controller_test.go @@ -48,7 +48,7 @@ const ( // otel-collector ConfigMap. // See `CollectorConfigGooglecloud` in `pkg/metrics/otel.go` // Used by TestOtelReconcilerGooglecloud. 
- depAnnotationGooglecloud = "c2f6078a9afe1f32721173e9e15bbab5" + depAnnotationGooglecloud = "bfa02552b80a227256e825c807254b40" // depAnnotationGooglecloud is the expected hash of the custom // otel-collector ConfigMap test artifact. // Used by TestOtelReconcilerCustom.