From 261656e172537a9e798aca7bb8c82e5dbfb7995c Mon Sep 17 00:00:00 2001 From: thomasvn Date: Fri, 25 Oct 2024 11:47:46 -0700 Subject: [PATCH 1/4] Remove docs which are no longer referenced in Helm chart. --- SUMMARY.md | 1 - architecture/architecture.md | 3 +- architecture/ports.md | 1 - .../custom-prom/grafana-mimir-integration.md | 11 - .../high-availability.md | 61 ----- .../windows-node-support.md | 5 - .../azure-out-of-cluster.md | 6 - .../install/etl-backup/etl-backup.md | 228 ------------------ .../etl-backup/query-service-replicas.md | 55 ----- .../install/etl-backup/sharing-etl-backups.md | 28 --- 10 files changed, 1 insertion(+), 398 deletions(-) delete mode 100644 install-and-configure/advanced-configuration/high-availability.md delete mode 100644 install-and-configure/install/etl-backup/etl-backup.md delete mode 100644 install-and-configure/install/etl-backup/query-service-replicas.md delete mode 100644 install-and-configure/install/etl-backup/sharing-etl-backups.md diff --git a/SUMMARY.md b/SUMMARY.md index 03f888459..e9128ea67 100644 --- a/SUMMARY.md +++ b/SUMMARY.md @@ -171,7 +171,6 @@ * [Kubecost Metrics](architecture/user-metrics.md) * [Kube-State-Metrics (KSM) Emission](architecture/ksm-metrics.md) * [ContainerStats Pipeline](architecture/containerstats-pipeline.md) -* [High Availability Mode](architecture/high-availability.md) * [GPU Allocation](architecture/gpu-allocation.md) * [Kubecost Cluster Roles](architecture/kubecost-cluster-roles.md) * [Pricing Sources Matrix](architecture/pricing-sources-matrix.md) diff --git a/architecture/architecture.md b/architecture/architecture.md index 0958c12f9..7053c7c5a 100644 --- a/architecture/architecture.md +++ b/architecture/architecture.md @@ -9,8 +9,7 @@ Below are the major components deployed with the [Kubecost Helm chart](/install- 1. Prometheus server: Time-series data store for cost and health metrics 2. Kube-state-metrics (optional): Provides Kubernetes API metrics, e.g. resource requests 3. Node-exporter (optional): Provides metrics for reserved instance recommendations, various Kubecost Grafana dashboards, and cluster health alerts - 4. Pushgateway (optional): Provides the ability for users to push new metrics to Prometheus - 5. Alertmanager (optional): Used for custom alerts + 4. Alertmanager (optional): Used for custom alerts 3. Network costs (optional): used for determining network egress costs. See our [Network Traffic Cost Allocation](/using-kubecost/navigating-the-kubecost-ui/cost-allocation/network-allocation.md) doc for more information. 4. 
Grafana (optional): Provides supporting dashboards for Kubecost product diff --git a/architecture/ports.md b/architecture/ports.md index fa9bab0e3..cb568f882 100644 --- a/architecture/ports.md +++ b/architecture/ports.md @@ -12,7 +12,6 @@ Kubecost components use following ports by default: | cost-analyzer-service - tcp-model | 9003 | | cost-analyzer-service - api-server | 9004 | | prometheus service | 9090 | -| prometheus pushgateway service | 9091 | | prometheus alertmanager-networkpolicy | 9093 | | kubecost-cluster-controller | 9731 | | kube-state-metrics | 8080 | diff --git a/install-and-configure/advanced-configuration/custom-prom/grafana-mimir-integration.md b/install-and-configure/advanced-configuration/custom-prom/grafana-mimir-integration.md index 488ebbd00..b005f9d3a 100644 --- a/install-and-configure/advanced-configuration/custom-prom/grafana-mimir-integration.md +++ b/install-and-configure/advanced-configuration/custom-prom/grafana-mimir-integration.md @@ -194,17 +194,6 @@ data: action: replace target_label: kubernetes_node - - job_name: 'prometheus-pushgateway' - honor_labels: true - - kubernetes_sd_configs: - - role: service - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] - action: keep - regex: pushgateway - - job_name: 'kubernetes-services' metrics_path: /probe diff --git a/install-and-configure/advanced-configuration/high-availability.md b/install-and-configure/advanced-configuration/high-availability.md deleted file mode 100644 index 9e6d1d9c3..000000000 --- a/install-and-configure/advanced-configuration/high-availability.md +++ /dev/null @@ -1,61 +0,0 @@ -# High Availability Kubecost - -{% hint style="warning" %} -High availability mode is no longer supported as of Kubecost v2. -{% endhint %} - -{% hint style="info" %} -High availability mode is only officially supported on Kubecost Enterprise plans. -{% endhint %} - -Running Kubecost in high availability (HA) mode is a feature that relies on multiple Kubecost replica pods implementing the [ETL Bucket Backup](/install-and-configure/install/etl-backup/etl-backup.md) feature combined with a Leader/Follower implementation which ensures that there always exists exactly one leader across all replicas. - -## Leader + Follower - -The Leader/Follower implementation leverages a `coordination.k8s.io/v1` `Lease` resource to manage the election of a leader when necessary. To control access of the backup from the ETL pipelines, a `RWStorageController` is implemented to ensure the following: - -* Followers block on all backup reads, and poll bucket storage for any backup reads every 30 seconds. -* Followers no-op on any backup writes. -* Followers who receive Queries in a backup store will not stack on pending reads, preventing external queries from blocking. -* Followers promoted to Leader will drop all locks and receive write privileges. -* Leaders behave identically to a single Kubecost install. 
- -## Configuring high availability - -In order to enable the leader/follower and HA features, the following must also be configured: - -* Replicas are set to a value greater than 1 -* ETL FileStore is Enabled (enabled by default) -* [ETL Bucket Backup](/install-and-configure/install/etl-backup/etl-backup.md) is configured - -For example, using our Helm chart, the following is an acceptable configuration: - -```bash -helm install kubecost kubecost/cost-analyzer --namespace kubecost \ - --set kubecostDeployment.leaderFollower.enabled=true \ - --set kubecostDeployment.replicas=5 \ - --set kubecostModel.etlBucketConfigSecret=kubecost-bucket-secret -``` - -This can also be done in the `values.yaml` file within the chart: - -```yaml -kubecostModel: - image: "gcr.io/kubecost1/cost-model" - imagePullPolicy: Always - # ... - # ETL should be enabled with etlFileStoreEnabled: true - etl: true - etlFileStoreEnabled: true - # ... - # ETL Bucket Backup should be configured by passing the configuration secret name - etlBucketConfigSecret: kubecost-bucket-secret - -# Used for HA mode in Enterprise tier -kubecostDeployment: - # Select a number of replicas of Kubecost pods to run - replicas: 5 - # Enable Leader/Follower Election - leaderFollower: - enabled: true -``` \ No newline at end of file diff --git a/install-and-configure/advanced-configuration/windows-node-support.md b/install-and-configure/advanced-configuration/windows-node-support.md index 3fbd94de7..abb493594 100644 --- a/install-and-configure/advanced-configuration/windows-node-support.md +++ b/install-and-configure/advanced-configuration/windows-node-support.md @@ -44,11 +44,6 @@ For DaemonSets, set the affinity to only allow scheduling on Windows nodes: See the list of all deployments and DaemonSets in this [*values-windows-node-affinity.yaml*](https://github.com/kubecost/cost-analyzer-helm-chart/blob/develop/cost-analyzer/values-windows-node-affinity.yaml) file: ``` -kubecostMetrics: - exporter: - nodeSelector: - kubernetes.io/os: linux - nodeSelector: kubernetes.io/os: linux diff --git a/install-and-configure/install/cloud-integration/azure-out-of-cluster/azure-out-of-cluster.md b/install-and-configure/install/cloud-integration/azure-out-of-cluster/azure-out-of-cluster.md index 7bbc42459..b1102859a 100644 --- a/install-and-configure/install/cloud-integration/azure-out-of-cluster/azure-out-of-cluster.md +++ b/install-and-configure/install/cloud-integration/azure-out-of-cluster/azure-out-of-cluster.md @@ -97,9 +97,3 @@ To troubleshoot a configuration that is not yet working: * `$ helm get values kubecost` to verify you've properly set `.Values.kubecostProductConfigs.cloudIntegrationSecret` * Verify that a non-empty CSV file has been created at this path in your Azure Portal Storage Account: `/////`. Ensure new CSVs are being generated every day. * When opening a cost report CSV, ensure that there are rows in the file that do not have a MeterCategory of “Virtual Machines” or “Storage” as these items are ignored because they are in cluster costs. Additionally, make sure that there are items with a UsageDateTime that matches the date you are interested in. - -When reviewing logs: - -* The following error is reflective of Kubecost's previous Azure Cloud Integration method and can be safely disregarded. 
- - `ERR Error, Failed to locate azure storage config file: /var/azure-storage-config/azure-storage-config.json` diff --git a/install-and-configure/install/etl-backup/etl-backup.md b/install-and-configure/install/etl-backup/etl-backup.md deleted file mode 100644 index 64533c3ef..000000000 --- a/install-and-configure/install/etl-backup/etl-backup.md +++ /dev/null @@ -1,228 +0,0 @@ -# ETL Backup - -{% hint style="warning" %} -We do not recommend enabling ETL Backup in conjunction with [Federated ETL](/install-and-configure/install/multi-cluster/federated-etl/federated-etl.md). -{% endhint %} - -Kubecost's extract, transform, load (ETL) data is a computed cache based on Prometheus's metrics, from which the user can perform all possible Kubecost queries. The ETL data is stored in a persistent volume mounted to the `kubecost-cost-analyzer` pod. - -There are a number of reasons why you may want to backup this ETL data: - -* To ensure a copy of your Kubecost data exists, so you can restore the data if needed -* To reduce the amount of historical data stored in Prometheus, and instead retain historical ETL data - -## Option 1: Automated durable ETL backups and monitoring - -Kubecost provides cloud storage backups for ETL backing storage. Backups are not the typical approach of "halt all reads/writes and dump the database." Instead, the backup system is a transparent feature that will always ensure that local ETL data is backed up, and if local data is missing, it can be retrieved from backup storage. This feature protects users from accidental data loss by ensuring that previously backed-up data can be restored at runtime. - -{% hint style="info" %} -Durable backup storage functionality is supported with a Kubecost Enterprise plan. -{% endhint %} - -When the ETL pipeline collects data, it stores daily and hourly (if configured) cost metrics on a configured storage. This defaults to a PV-based disk storage, but can be configured to use external durable storage on the following providers: - -* AWS S3 -* Azure Blob Storage -* Google Cloud Storage - -### Step 1: Create storage configuration secret - -This configuration secret follows the same layout documented for Thanos [here](https://thanos.io/v0.21/thanos/storage.md). - -You will need to create a file named _object-store.yaml_ using the chosen storage provider configuration (documented below), and run the following command to create the secret from this file: - -{% code overflow="wrap" %} -```bash -kubectl create secret generic -n kubecost --from-file=object-store.yaml -``` -{% endcode %} - -The file must be named _object-store.yaml_. - -
- -Existing Thanos users - -If you have already followed our [Configuring Thanos](/install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md) guide, you can reuse the previously created bucket configuration secret. - -Setting `.Values.kubecostModel.etlBucketConfigSecret=kubecost-thanos` will enable the backup feature. This will back up all ETL data to the same bucket being used by Thanos. - -
- -
- -S3 - -The configuration schema for S3 can be found in this [Thanos documentation](https://thanos.io/v0.21/thanos/storage.md#s3). For reference, here's an example: - -{% code overflow="wrap" %} -```yaml -type: S3 -config: - bucket: "my-bucket" - endpoint: "s3.amazonaws.com" - region: "us-west-2" - access_key: "" - secret_key: "" - insecure: false - signature_version2: false - put_user_metadata: - "X-Amz-Acl": "bucket-owner-full-control" -prefix: "" # Optional. Specify a path within the bucket (e.g. "kubecost/etlbackup"). -``` -{% endcode %} - -
- -
- -Google Cloud Storage - -The configuration schema for Google Cloud Storage can be found in this [Thanos documentation](https://thanos.io/v0.21/thanos/storage.md/#gcs). For reference, here's an example: - -{% code overflow="wrap" %} -```yaml -type: GCS -config: - bucket: "my-bucket" - service_account: |- - { - "type": "service_account", - "project_id": "project", - "private_key_id": "abcdefghijklmnopqrstuvwxyz12345678906666", - "private_key": "-----BEGIN PRIVATE KEY-----\...\n-----END PRIVATE KEY-----\n", - "client_email": "project@kubecost.iam.gserviceaccount.com", - "client_id": "123456789012345678901", - "auth_uri": "https://accounts.google.com/o/oauth2/auth", - "token_uri": "https://oauth2.googleapis.com/token", - "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", - "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/kubecost%40gitpods.iam.gserviceaccount.com" - } -prefix: "" # Optional. Specify a path within the bucket (e.g. "kubecost/etlbackup"). -``` -{% endcode %} - -
- -
- -Azure - -The configuration schema for Azure can be found in this [Thanos documentation](https://thanos.io/v0.21/thanos/storage.md/#azure). For reference, here's an example: - -{% code overflow="wrap" %} -```yaml -type: AZURE -config: - storage_account: "" - storage_account_key: "" - container: "my-bucket" - endpoint: "" -prefix: "" # Optional. Specify a path within the bucket (e.g. "kubecost/etlbackup"). -``` -{% endcode %} - -
- -#### S3 compatible tooling - -
- -Storj - -Because Storj is [S3 compatible](https://docs.storj.io/dcs/api-reference/s3-compatible-gateway/), it can be used as a drop-in replacement for S3. After an S3 Compatible Access Grant has been created, an example configuration would be: - -{% code overflow="wrap" %} -```yaml -type: S3 -config: - bucket: "my-bucket" - endpoint: "gateway.storjshare.io" - access_key: "" - secret_key: "" - insecure: false - signature_version2: false - http_config: - idle_conn_timeout: 90s - response_header_timeout: 2m - insecure_skip_verify: false - trace: - enable: true - part_size: 134217728 -prefix: "" # Optional. Specify a path within the bucket (e.g. "kubecost/etlbackup"). -``` -{% endcode %} - -
- -
- -Hitachi Content Platform (HCP) - -Because HCP is [S3 compatible](https://knowledge.hitachivantara.com/Documents/Storage/HCP\_for\_Cloud\_Scale/1.0.0/Adminstering\_HCP\_for\_cloud\_scale/Getting\_started/02\_Support\_for\_Amazon\_S3\_API), it can be used as a drop-in replacement for S3. To obtain the necessary S3 User Credentials, see [Hitachi's documentation](https://knowledge.hitachivantara.com/Documents/Storage/HCP\_for\_Cloud\_Scale/1.0.0/Adminstering\_HCP\_for\_cloud\_scale/Object\_storage\_management/01\_S3\_User\_Credentials#GUID-6DA3811F-FBC5-4848-B47D-B2297F0902B7). Afterwards, follow the example below to configure the secret. - -For `bucket`, the value should be the folder created in the HCP endpoint bucket, not the pre-existing bucket name. - -{% code overflow="wrap" %} -``` -type: S3 -config: - bucket: "folder name" - endpoint: "gateway.storjshare.io" - access_key: "" - secret_key: "" - insecure: false - signature_version2: false - http_config: - idle_conn_timeout: 90s - response_header_timeout: 2m - insecure_skip_verify: false - trace: - enable: true - part_size: 134217728 -prefix: "" # Optional. Specify a path within the bucket (e.g. "kubecost/etlbackup"). -``` -{% endcode %} - -
- -### Step 2: Enable ETL backup in Helm values - -If Kubecost was installed via Helm, ensure the following value is set. - -```yaml -kubecostModel: - etlBucketConfigSecret: -``` - -### Compatibility - -If you are using an existing disk storage option for your ETL data, enabling the durable backup feature will retroactively back up all previously stored data\*. This feature is also fully compatible with the existing S3 backup feature. - -{% hint style="info" %} -If you are using a memory store for your ETL data with a local disk backup (`kubecostModel.etlFileStoreEnabled: false`), the backup feature will simply replace the local backup. In order to take advantage of the retroactive backup feature, you will need to update to file store (`kubecostModel.etlFileStoreEnabled: true`). This option is now enabled by default in the Helm chart. -{% endhint %} - -## Option 2: Manual backup via Bash script - -The simplest way to backup Kubecost's ETL is to copy the pod's ETL store to your local disk. You can then send that file to any other storage system of your choice. We provide a [script](https://github.com/kubecost/etl-backup) to do that. - -To restore the backup, untar the results of the ETL backup script into the ETL directory pod. - -{% code overflow="wrap" %} -```bash -kubectl cp -c cost-model /bingen /:/var/configs/db/etl -``` -{% endcode %} - -There is also a Bash script available to restore the backup in [Kubecost's etl-backup repo](https://github.com/kubecost/etl-backup/blob/main/upload-etl.sh). - -## Monitoring - -Currently, this feature is still in development, but there is currently a status card available on the Diagnostics page that will eventually show the status of the backup system: - -![Diagnostic ETL Backup Status](/images/diagnostics-etl-backup-status.png) - -## Troubleshooting - -In some scenarios like when using Memory store, setting `kubecostModel.etlHourlyStoreDurationHours` to a value of `48` hours or less will cause ETL backup files to become truncated. The current recommendation is to keep [etlHourlyStoreDurationHours](https://github.com/kubecost/cost-analyzer-helm-chart/blob/8fd5502925c28c56af38b0c4e66c4ec746761d50/cost-analyzer/values.yaml#L322) at its default of `49` hours. diff --git a/install-and-configure/install/etl-backup/query-service-replicas.md b/install-and-configure/install/etl-backup/query-service-replicas.md deleted file mode 100644 index b942e1a0a..000000000 --- a/install-and-configure/install/etl-backup/query-service-replicas.md +++ /dev/null @@ -1,55 +0,0 @@ -# Query Service Replicas - -{% hint style="warning" %} -Query service replicas are no longer supported as of Kubecost v2. -{% endhint %} - -{% hint style="info" %} -This feature is only supported on Kubecost Enterprise plans. -{% endhint %} - -The query service replica (QSR) is a scale-out query service that reduces load on the cost-model pod. It allows for improved horizontal scaling by being able to handle queries for larger intervals, and multiple simultaneous queries. - -## Overview - -The query service will forward `/model/allocation` and `/model/assets` requests to the Query Services StatefulSet. - -The diagram below demonstrates the backing architecture of this query service and its functionality. - -![Query service architecture](/.gitbook/assets/qsr-arch.png) - -## Requirements - -### ETL data source - -There are three options that can be used for the source ETL Files: - -1. 
For environments that have Kubecost [Federated ETL](/install-and-configure/install/multi-cluster/federated-etl/federated-etl.md) enabled, this store will be used, no additional configuration is required. -2. For single cluster environments, QSR can target the ETL backup store. To learn more about ETL backups, see the [ETL Backup](/install-and-configure/install/etl-backup/etl-backup.md) doc. -3. Alternatively, an object-store containing the ETL dataset to be queried can be configured using a secret `kubecostDeployment.queryServiceConfigSecret`. The file name of the secret must be `object-store.yaml`. Examples can be found in our [Configuring Thanos](/install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md#step-1-create-object-storeyaml) doc. - -### Persistent volume on Kubecost primary instance - -QSR uses persistent volume storage to avoid excessive S3 transfers. Data is retrieved from S3 hourly as new ETL files are created and stored in these PVs. The `databaseVolumeSize` should be larger than the size of the data in the S3 bucket. - -When the pods start, data from the object-store is synced and this can take a significant time in large environments. During the sync, parts of the Kubecost UI will appear broken or have missing data. You can follow the pod logs to see when the sync is complete. - -The default of 100Gi is enough storage for 1M pods and 90 days of retention. This can be adjusted: - -```yaml -kubecostDeployment: - queryServiceReplicas: 2 - queryService: - # default storage class - storageClass: "" - databaseVolumeSize: 100Gi - configVolumeSize: 1G -``` - -## Enabling QSR - -Once the data store is configured, set `kubecostDeployment.queryServiceReplicas` to a non-zero value and perform a Helm upgrade. - -## Usage - -Once QSR has been enabled, the new pods will automatically handle all API requests to `/model/allocation` and `/model/assets`. \ No newline at end of file diff --git a/install-and-configure/install/etl-backup/sharing-etl-backups.md b/install-and-configure/install/etl-backup/sharing-etl-backups.md deleted file mode 100644 index 868484514..000000000 --- a/install-and-configure/install/etl-backup/sharing-etl-backups.md +++ /dev/null @@ -1,28 +0,0 @@ -# Sharing ETL Backups - -This document will describe why your Kubecost instance’s data can be useful to share with us, what content is in the data, and how to share it. - -Kubecost product releases are tested and verified against a combination of generated/synthetic Kubernetes cluster data and examples of customer data that have been shared with us. Customers who share snapshots of their data with us help to ensure that product changes handle their specific use cases and scales. Because the Kubecost product for many customers is run as an on-prem service, with no data sharing back to us, we do not inherently have this data for many of our customers. - -Sharing data with us requires an ETL backup executed by the customer in their own environment before the resulting data can be sent out. Kubecost's ETL is a computed cache built upon Prometheus metrics and cloud billing data, on which nearly all API requests made by the user and the Kubecost frontend currently rely. Therefore, the ETL data will contain metric data and identifying information for that metric (e.g. a container name, pod name, namespace, and cluster name) during a time window, but will not contain other information about containers, pods, clusters, cloud resources, etc. 
You can read more about these metric details in our [Kubecost Metrics](/architecture/user-metrics.md) doc. - -The full methodology for creating the ETL backup can be found in our [ETL Backup](etl-backup.md) doc. Once these files have been backed up, the content will look as follows before compressing the data: - -```txt -├── etl -│ ├── bingen -│ │ ├── allocations -│ │ │ ├── 1d # data chunks of 1 day -│ │ │ │ ├── filename: {start timestamp}-{end timestamp} -│ │ │ ├── 1h # data chunks of 1 hour -│ │ │ │ ├── filename: {start timestamp}-{end timestamp} -│ │ ├── assets -│ │ │ ├── 1d # data chunks of 1 day -│ │ │ │ ├── filename: {start timestamp}-{end timestamp} -│ │ │ ├── 1h # data chunks of 1 hour -│ │ │ │ ├── filename: {start timestamp}-{end timestamp} -``` - -Once the data is downloaded to the local disk from either the automated or manual ETL backup methods, the data must be converted to a gzip file. A suggested method for downloading the ETL backup and compressing it quickly is to use [this script](https://github.com/kubecost/etl-backup/blob/main/download-etl.sh). Check out the `tar` syntax in that script if doing this manually without the script. When the compressed ETL backup is ready to share, please work with a Kubecost support engineer on sharing the file with us. Our most common approach is to use a Google Drive folder with access limited to you and the support engineer, but we recognize not all companies are open to this and will work with you to determine the most business-appropriate method. - -If you are interested in reviewing the contents of the data, either before or after sending the ETL backup to us, you can find an example Golang implementation on how to read the [raw ETL data](https://github.com/kubecost/etl-backup#run-etl-from-backed-up-data). From 5cd20733c0d001f9c2be0bef49e759cc0abedf34 Mon Sep 17 00:00:00 2001 From: thomasvn Date: Fri, 25 Oct 2024 12:03:59 -0700 Subject: [PATCH 2/4] Fix links. --- .../thanos-setup/configuring-thanos.md | 222 ------------------ .../cluster-right-sizing-recommendations.md | 6 +- 2 files changed, 3 insertions(+), 225 deletions(-) delete mode 100644 install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md diff --git a/install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md b/install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md deleted file mode 100644 index d9f7882f4..000000000 --- a/install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md +++ /dev/null @@ -1,222 +0,0 @@ -# Configuring Thanos (Deprecated) - -{% hint style="warning" %} -As of Kubecost v2, support for Thanos is deprecated. Consider [transitioning to our Aggregator architecture](/install-and-configure/install/multi-cluster/federated-etl/thanos-migration-guide.md) if you plan to upgrade. -{% endhint %} - -{% hint style="info" %} -This feature is only offically available on [Kubecost Enterprise plans](https://www.kubecost.com/pricing/). -{% endhint %} - -Kubecost leverages Thanos and durable storage for three different purposes: - -1. Centralize metric data for a global multi-cluster view into Kubernetes costs via a Prometheus sidecar -2. Allow for unlimited data retention -3. Backup Kubecost [ETL data](/install-and-configure/install/etl-backup/etl-backup.md) - -To enable Thanos, follow these steps: - -## Step 1: Create _object-store.yaml_ - -This step creates the _object-store.yaml_ file that contains your durable storage target (e.g. GCS, S3, etc.) 
configuration and access credentials. The details of this file are documented thoroughly in [Thanos documentation](https://thanos.io/tip/thanos/storage.md/). - -We have guides for using cloud-native storage for the largest cloud providers. Other providers can be similarly configured. - -Use the appropriate guide for your cloud provider: - -* [Google Cloud Storage](/install-and-configure/install/multi-cluster/long-term-storage-configuration/long-term-storage-gcp.md) -* [AWS/S3](/install-and-configure/install/multi-cluster/long-term-storage-configuration/long-term-storage-aws.md) -* [Azure](/install-and-configure/install/multi-cluster/long-term-storage-configuration/long-term-storage-azure.md) - -## Step 2: Create object-store secret - -Create a secret with the .yaml file generated in the previous step: - -{% code overflow="wrap" %} -```shell -kubectl create secret generic kubecost-thanos -n kubecost --from-file=./object-store.yaml -``` -{% endcode %} - -## Step 3: Unique Cluster ID - -Each cluster needs to be labelled with a unique Cluster ID, which is done in two places. - -`values-clusterName.yaml` - -```yaml -kubecostProductConfigs: - clusterName: kubecostProductConfigs_clusterName -prometheus: - server: - global: - external_labels: - cluster_id: kubecostProductConfigs_clusterName -``` - -## Step 4: Deploying Kubecost with Thanos - -The Thanos subchart includes `thanos-bucket`, `thanos-query`, `thanos-store`, `thanos-compact`, and service discovery for `thanos-sidecar`. These components are recommended when deploying Thanos on the primary cluster. - -These values can be adjusted under the `thanos` block in _values-thanos.yaml_. - -{% code overflow="wrap" %} -```shell -helm upgrade kubecost kubecost/cost-analyzer \ - --install \ - --namespace kubecost \ - -f https://raw.githubusercontent.com/kubecost/cost-analyzer-helm-chart/master/cost-analyzer/values-thanos.yaml \ - -f values-clusterName.yaml -``` -{% endcode %} - -{% hint style="info" %} -The `thanos-store` container is configured to request 2.5GB memory, this may be reduced for smaller deployments. `thanos-store` is only used on the primary Kubecost cluster. -{% endhint %} - -To verify installation, check to see all Pods are in a _READY_ state. View Pod logs for more detail and see common troubleshooting steps below. - -## Troubleshooting - -Thanos sends data to the bucket every 2 hours. Once 2 hours have passed, logs should indicate if data has been sent successfully or not. 
- -You can monitor the logs with: - -{% code overflow="wrap" %} -```bash -kubectl logs --namespace kubecost -l app=prometheus -l component=server --prefix=true --container thanos-sidecar --tail=-1 | grep uploaded -``` -{% endcode %} - -Monitoring logs this way should return results like this: - -{% code overflow="wrap" %} -```log -[pod/kubecost-prometheus-server-xxx/thanos-sidecar] level=debug ts=2022-06-09T13:00:10.084904136Z caller=objstore.go:206 msg="uploaded file" from=/data/thanos/upload/BUCKETID/chunks/000001 dst=BUCKETID/chunks/000001 bucket="tracing: kc-thanos-store" -``` -{% endcode %} - -As an aside, you can validate the Prometheus metrics are all configured with correct cluster names with: - -{% code overflow="wrap" %} -```bash -kubectl logs --namespace kubecost -l app=prometheus -l component=server --prefix=true --container thanos-sidecar --tail=-1 | grep external_labels -``` -{% endcode %} - -To troubleshoot the IAM Role Attached to the serviceaccount, you can create a Pod using the same service account used by the thanos-sidecar (default is `kubecost-prometheus-server`): - -`s3-pod.yaml` - -```yaml -apiVersion: v1 -kind: Pod -metadata: - labels: - run: s3-pod - name: s3-pod -spec: - serviceAccountName: kubecost-prometheus-server - containers: - - image: amazon/aws-cli - name: my-aws-cli - command: ['sleep', '500'] -``` - -```bash -kubectl apply -f s3-pod.yaml -kubectl exec -i -t s3-pod -- aws s3 ls s3://kc-thanos-store -``` - -This should return a list of objects (or at least not give a permission error). - -### Cluster not writing data to thanos bucket - -If a cluster is not successfully writing data to the bucket, review `thanos-sidecar` logs with the following command: - -```shell -kubectl logs kubecost-prometheus-server- -n kubecost -c thanos-sidecar -``` - -Logs in the following format are evidence of a successful bucket write: - -{% code overflow="wrap" %} -``` -level=debug ts=2019-12-20T20:38:32.288251067Z caller=objstore.go:91 msg="uploaded file" from=/data/thanos/upload/BUCKET-ID/meta.json dst=debug/metas/BUCKET-ID.json bucket=kc-thanos -``` -{% endcode %} - -### Stores not listed at the `/stores` endpoint - -If thanos-query can't connect to both the sidecar and the store, you may want to directly specify the store gRPC service address instead of using DNS discovery (the default). You can quickly test if this is the issue by running: - -`kubectl edit deployment kubecost-thanos-query -n kubecost` - -and adding - -`--store=kubecost-thanos-store-grpc.kubecost:10901` - -to the container args. This will cause a query restart and you can visit `/stores` again to see if the store has been added. - -If it has, you'll want to use these addresses instead of DNS more permanently by setting .Values.thanos.query.stores in _values-thanos.yaml_. - -``` -... -thanos: - store: - enabled: true - grpcSeriesMaxConcurrency: 20 - blockSyncConcurrency: 20 - extraEnv: - - name: GOGC - value: "100" - resources: - requests: - memory: "2.5Gi" - query: - enabled: true - timeout: 3m - # Maximum number of queries processed concurrently by query node. - maxConcurrent: 8 - # Maximum number of select requests made concurrently per a query. 
- maxConcurrentSelect: 2 - resources: - requests: - memory: "2.5Gi" - autoDownsampling: false - extraEnv: - - name: GOGC - value: "100" - stores: - - "kubecost-thanos-store-grpc.kubecost:10901" -``` - -### Additional Troubleshooting - -A common error is as follows, which means you do not have the correct access to the supplied bucket: - -{% code overflow="wrap" %} -``` -thanos-svc-account@project-227514.iam.gserviceaccount.com does not have storage.objects.list access to thanos-bucket., forbidden" -``` -{% endcode %} - -Assuming pods are running, use port forwarding to connect to the `thanos-query-http` endpoint: - -```shell -kubectl port-forward svc/kubecost-thanos-query-http 8080:10902 --namespace kubecost -``` - -Then navigate to [http://localhost:8080](http://localhost:8080) in your browser. This page should look very similar to the Prometheus console. - -![Thanos query editor](/images/thanos-query.png) - -If you navigate to _Stores_ using the top navigation bar, you should be able to see the status of both the `thanos-store` and `thanos-sidecar` which accompanied the Prometheus server: - -![Thanos stores](/images/thanos-store.png) - -Also note that the sidecar should identify with the unique `cluster_id` provided in your _values.yaml_ in the previous step. Default value is `cluster-one`. - -The default retention period for when data is moved into the object storage is currently _2h_. This configuration is based on Thanos suggested values. **By default, it will be 2 hours before data is written to the provided bucket.** - -Instead of waiting _2h_ to ensure that Thanos was configured correctly, the default log level for the Thanos workloads is `debug` (it's very light logging even on debug). You can get logs for the `thanos-sidecar`, which is part of the `prometheus-server` Pod, and `thanos-store`. The logs should give you a clear indication of whether or not there was a problem consuming the secret and what the issue is. For more on Thanos architecture, view [this resource](https://github.com/thanos-io/thanos/blob/master/docs/design.md). 
diff --git a/using-kubecost/navigating-the-kubecost-ui/savings/cluster-right-sizing-recommendations.md b/using-kubecost/navigating-the-kubecost-ui/savings/cluster-right-sizing-recommendations.md index 69038abb9..45014c5e2 100644 --- a/using-kubecost/navigating-the-kubecost-ui/savings/cluster-right-sizing-recommendations.md +++ b/using-kubecost/navigating-the-kubecost-ui/savings/cluster-right-sizing-recommendations.md @@ -123,7 +123,7 @@ kubecostProductConfigs: ### Supported instance types The complete lists of supported instance types currently available for each of the supported cloud service providers (AWS, GCP, Azure) can be found in the Helm chart: -- [AWS](https://github.com/kubecost/cost-analyzer-helm-chart/blob/develop/cost-analyzer/savings-recommendations-allow-lists-aws.yaml) -- [Azure](https://github.com/kubecost/cost-analyzer-helm-chart/blob/develop/cost-analyzer/savings-recommendations-allow-lists-azure.yaml) -- [GCP](https://github.com/kubecost/cost-analyzer-helm-chart/blob/develop/cost-analyzer/savings-recommendations-allow-lists-gcp.yaml) +* [AWS](https://github.com/kubecost/cost-analyzer-helm-chart/blob/develop/cost-analyzer/values-savings-rec-allowlist-aws.yaml) +* [Azure](https://github.com/kubecost/cost-analyzer-helm-chart/blob/develop/cost-analyzer/values-savings-rec-allowlist-azure.yaml) +* [GCP](https://github.com/kubecost/cost-analyzer-helm-chart/blob/develop/cost-analyzer/values-savings-rec-allowlist-gcp.yaml) From 0233493a851063683405d560ee26a3895bbaecb9 Mon Sep 17 00:00:00 2001 From: thomasvn Date: Fri, 25 Oct 2024 12:09:46 -0700 Subject: [PATCH 3/4] Fix links pt2 --- .../thanos-setup/configuring-thanos.md | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md diff --git a/install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md b/install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md new file mode 100644 index 000000000..2e3534b02 --- /dev/null +++ b/install-and-configure/install/multi-cluster/thanos-setup/configuring-thanos.md @@ -0,0 +1,222 @@ +# Configuring Thanos (Deprecated) + +{% hint style="warning" %} +As of Kubecost v2, support for Thanos is deprecated. Consider [transitioning to our Aggregator architecture](/install-and-configure/install/multi-cluster/federated-etl/thanos-migration-guide.md) if you plan to upgrade. +{% endhint %} + +{% hint style="info" %} +This feature is only offically available on [Kubecost Enterprise plans](https://www.kubecost.com/pricing/). +{% endhint %} + +Kubecost leverages Thanos and durable storage for three different purposes: + +1. Centralize metric data for a global multi-cluster view into Kubernetes costs via a Prometheus sidecar +2. Allow for unlimited data retention +3. Backup Kubecost ETL data + +To enable Thanos, follow these steps: + +## Step 1: Create _object-store.yaml_ + +This step creates the _object-store.yaml_ file that contains your durable storage target (e.g. GCS, S3, etc.) configuration and access credentials. The details of this file are documented thoroughly in [Thanos documentation](https://thanos.io/tip/thanos/storage.md/). + +We have guides for using cloud-native storage for the largest cloud providers. Other providers can be similarly configured. 
+ +Use the appropriate guide for your cloud provider: + +* [Google Cloud Storage](/install-and-configure/install/multi-cluster/long-term-storage-configuration/long-term-storage-gcp.md) +* [AWS/S3](/install-and-configure/install/multi-cluster/long-term-storage-configuration/long-term-storage-aws.md) +* [Azure](/install-and-configure/install/multi-cluster/long-term-storage-configuration/long-term-storage-azure.md) + +## Step 2: Create object-store secret + +Create a secret with the .yaml file generated in the previous step: + +{% code overflow="wrap" %} +```shell +kubectl create secret generic kubecost-thanos -n kubecost --from-file=./object-store.yaml +``` +{% endcode %} + +## Step 3: Unique Cluster ID + +Each cluster needs to be labelled with a unique Cluster ID, which is done in two places. + +`values-clusterName.yaml` + +```yaml +kubecostProductConfigs: + clusterName: kubecostProductConfigs_clusterName +prometheus: + server: + global: + external_labels: + cluster_id: kubecostProductConfigs_clusterName +``` + +## Step 4: Deploying Kubecost with Thanos + +The Thanos subchart includes `thanos-bucket`, `thanos-query`, `thanos-store`, `thanos-compact`, and service discovery for `thanos-sidecar`. These components are recommended when deploying Thanos on the primary cluster. + +These values can be adjusted under the `thanos` block in _values-thanos.yaml_. + +{% code overflow="wrap" %} +```shell +helm upgrade kubecost kubecost/cost-analyzer \ + --install \ + --namespace kubecost \ + -f https://raw.githubusercontent.com/kubecost/cost-analyzer-helm-chart/master/cost-analyzer/values-thanos.yaml \ + -f values-clusterName.yaml +``` +{% endcode %} + +{% hint style="info" %} +The `thanos-store` container is configured to request 2.5GB memory, this may be reduced for smaller deployments. `thanos-store` is only used on the primary Kubecost cluster. +{% endhint %} + +To verify installation, check to see all Pods are in a _READY_ state. View Pod logs for more detail and see common troubleshooting steps below. + +## Troubleshooting + +Thanos sends data to the bucket every 2 hours. Once 2 hours have passed, logs should indicate if data has been sent successfully or not. 
+ +You can monitor the logs with: + +{% code overflow="wrap" %} +```bash +kubectl logs --namespace kubecost -l app=prometheus -l component=server --prefix=true --container thanos-sidecar --tail=-1 | grep uploaded +``` +{% endcode %} + +Monitoring logs this way should return results like this: + +{% code overflow="wrap" %} +```log +[pod/kubecost-prometheus-server-xxx/thanos-sidecar] level=debug ts=2022-06-09T13:00:10.084904136Z caller=objstore.go:206 msg="uploaded file" from=/data/thanos/upload/BUCKETID/chunks/000001 dst=BUCKETID/chunks/000001 bucket="tracing: kc-thanos-store" +``` +{% endcode %} + +As an aside, you can validate the Prometheus metrics are all configured with correct cluster names with: + +{% code overflow="wrap" %} +```bash +kubectl logs --namespace kubecost -l app=prometheus -l component=server --prefix=true --container thanos-sidecar --tail=-1 | grep external_labels +``` +{% endcode %} + +To troubleshoot the IAM Role Attached to the serviceaccount, you can create a Pod using the same service account used by the thanos-sidecar (default is `kubecost-prometheus-server`): + +`s3-pod.yaml` + +```yaml +apiVersion: v1 +kind: Pod +metadata: + labels: + run: s3-pod + name: s3-pod +spec: + serviceAccountName: kubecost-prometheus-server + containers: + - image: amazon/aws-cli + name: my-aws-cli + command: ['sleep', '500'] +``` + +```bash +kubectl apply -f s3-pod.yaml +kubectl exec -i -t s3-pod -- aws s3 ls s3://kc-thanos-store +``` + +This should return a list of objects (or at least not give a permission error). + +### Cluster not writing data to thanos bucket + +If a cluster is not successfully writing data to the bucket, review `thanos-sidecar` logs with the following command: + +```shell +kubectl logs kubecost-prometheus-server- -n kubecost -c thanos-sidecar +``` + +Logs in the following format are evidence of a successful bucket write: + +{% code overflow="wrap" %} +``` +level=debug ts=2019-12-20T20:38:32.288251067Z caller=objstore.go:91 msg="uploaded file" from=/data/thanos/upload/BUCKET-ID/meta.json dst=debug/metas/BUCKET-ID.json bucket=kc-thanos +``` +{% endcode %} + +### Stores not listed at the `/stores` endpoint + +If thanos-query can't connect to both the sidecar and the store, you may want to directly specify the store gRPC service address instead of using DNS discovery (the default). You can quickly test if this is the issue by running: + +`kubectl edit deployment kubecost-thanos-query -n kubecost` + +and adding + +`--store=kubecost-thanos-store-grpc.kubecost:10901` + +to the container args. This will cause a query restart and you can visit `/stores` again to see if the store has been added. + +If it has, you'll want to use these addresses instead of DNS more permanently by setting .Values.thanos.query.stores in _values-thanos.yaml_. + +``` +... +thanos: + store: + enabled: true + grpcSeriesMaxConcurrency: 20 + blockSyncConcurrency: 20 + extraEnv: + - name: GOGC + value: "100" + resources: + requests: + memory: "2.5Gi" + query: + enabled: true + timeout: 3m + # Maximum number of queries processed concurrently by query node. + maxConcurrent: 8 + # Maximum number of select requests made concurrently per a query. 
+ maxConcurrentSelect: 2 + resources: + requests: + memory: "2.5Gi" + autoDownsampling: false + extraEnv: + - name: GOGC + value: "100" + stores: + - "kubecost-thanos-store-grpc.kubecost:10901" +``` + +### Additional Troubleshooting + +A common error is as follows, which means you do not have the correct access to the supplied bucket: + +{% code overflow="wrap" %} +``` +thanos-svc-account@project-227514.iam.gserviceaccount.com does not have storage.objects.list access to thanos-bucket., forbidden" +``` +{% endcode %} + +Assuming pods are running, use port forwarding to connect to the `thanos-query-http` endpoint: + +```shell +kubectl port-forward svc/kubecost-thanos-query-http 8080:10902 --namespace kubecost +``` + +Then navigate to [http://localhost:8080](http://localhost:8080) in your browser. This page should look very similar to the Prometheus console. + +![Thanos query editor](/images/thanos-query.png) + +If you navigate to _Stores_ using the top navigation bar, you should be able to see the status of both the `thanos-store` and `thanos-sidecar` which accompanied the Prometheus server: + +![Thanos stores](/images/thanos-store.png) + +Also note that the sidecar should identify with the unique `cluster_id` provided in your _values.yaml_ in the previous step. Default value is `cluster-one`. + +The default retention period for when data is moved into the object storage is currently _2h_. This configuration is based on Thanos suggested values. **By default, it will be 2 hours before data is written to the provided bucket.** + +Instead of waiting _2h_ to ensure that Thanos was configured correctly, the default log level for the Thanos workloads is `debug` (it's very light logging even on debug). You can get logs for the `thanos-sidecar`, which is part of the `prometheus-server` Pod, and `thanos-store`. The logs should give you a clear indication of whether or not there was a problem consuming the secret and what the issue is. For more on Thanos architecture, view [this resource](https://github.com/thanos-io/thanos/blob/master/docs/design.md). From df513d807a1e8aceba1f52928de190cdd953a09a Mon Sep 17 00:00:00 2001 From: thomasvn Date: Mon, 28 Oct 2024 12:46:03 -0700 Subject: [PATCH 4/4] Revert changes to SUMMARY.md --- SUMMARY.md | 1 + 1 file changed, 1 insertion(+) diff --git a/SUMMARY.md b/SUMMARY.md index e9128ea67..03f888459 100644 --- a/SUMMARY.md +++ b/SUMMARY.md @@ -171,6 +171,7 @@ * [Kubecost Metrics](architecture/user-metrics.md) * [Kube-State-Metrics (KSM) Emission](architecture/ksm-metrics.md) * [ContainerStats Pipeline](architecture/containerstats-pipeline.md) +* [High Availability Mode](architecture/high-availability.md) * [GPU Allocation](architecture/gpu-allocation.md) * [Kubecost Cluster Roles](architecture/kubecost-cluster-roles.md) * [Pricing Sources Matrix](architecture/pricing-sources-matrix.md)
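The four patches above are standard `git format-patch` output, so they can be applied to a checkout of the docs repository with `git am`. A minimal sketch, assuming the patches are saved locally as `0001-*.patch` through `0004-*.patch` (the file names are illustrative):

```bash
# Apply the patch series in order; --3way falls back to a three-way merge
# if the docs checkout has drifted from the commits these were generated against.
git am --3way 0001-*.patch 0002-*.patch 0003-*.patch 0004-*.patch

# If a patch fails to apply cleanly, resolve the conflict and continue,
# or abort to return the branch to its original state:
#   git am --continue
#   git am --abort
```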