From e43e0112efc72200c5d5877d700fbf47e845a3a6 Mon Sep 17 00:00:00 2001 From: Vaidas Balys Date: Wed, 9 Oct 2024 14:41:41 +0300 Subject: [PATCH 1/3] Configure data request timeout --- config_example.json | 1 + internal/config/config.go | 5 +++++ internal/prometheus/prometheus.go | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/config_example.json b/config_example.json index 93ab8cc..0e4f1c6 100644 --- a/config_example.json +++ b/config_example.json @@ -2,6 +2,7 @@ "graphqlURL": "http://localhost:8090/graphql/", "graphqlAPIToken": "Token SECRET", "cacheExpire": 0, + "timeout": 60, "retryOnError": false, "metricsPrefix": "graphql_exporter_", "queries":[ diff --git a/internal/config/config.go b/internal/config/config.go index 08f1149..d951056 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,6 +11,7 @@ type Cfg struct { GraphqlURL string GraphqlAPIToken string CacheExpire int64 + Timeout int64 RetryOnError bool MetricsPrefix string Queries []Query @@ -57,6 +58,10 @@ func Init(configPath string) error { Config.GraphqlAPIToken = val } + if Config.Timeout == 0 { + Config.Timeout = 60 + } + slog.Info(fmt.Sprintf("Finished reading config from %s", configPath)) return nil } diff --git a/internal/prometheus/prometheus.go b/internal/prometheus/prometheus.go index 8494490..f7f396b 100644 --- a/internal/prometheus/prometheus.go +++ b/internal/prometheus/prometheus.go @@ -158,7 +158,7 @@ func (collector *GraphqlCollector) Describe(ch chan<- *prometheus.Desc) {} func (collector *GraphqlCollector) updateMetrics() error { if time.Now().Unix()-collector.cachedAt > config.Config.CacheExpire { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) + ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(config.Config.Timeout)) defer cancel() metrics, err := collector.getMetrics(ctx) collector.accessMu.Lock() From fee0e4dfe75f69f78a39b19ab4e9b7fabcb014b9 Mon Sep 17 00:00:00 2001 From: Vaidas 
Balys Date: Wed, 9 Oct 2024 15:02:18 +0300 Subject: [PATCH 2/3] Try to execute as many queries as possible, with individual configurable timeouts --- config_example.json | 4 ++-- internal/config/config.go | 6 +++--- internal/prometheus/prometheus.go | 18 ++++++++++-------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/config_example.json b/config_example.json index 0e4f1c6..58637b6 100644 --- a/config_example.json +++ b/config_example.json @@ -1,10 +1,10 @@ { + "metricsPrefix": "graphql_exporter_", "graphqlURL": "http://localhost:8090/graphql/", "graphqlAPIToken": "Token SECRET", "cacheExpire": 0, - "timeout": 60, + "queryTimeout": 60, "retryOnError": false, - "metricsPrefix": "graphql_exporter_", "queries":[ { "query": "query {device_list {name serial custom_fields}} {{NOW \"-1h\"}}", diff --git a/internal/config/config.go b/internal/config/config.go index d951056..8f1464b 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,7 +11,7 @@ type Cfg struct { GraphqlURL string GraphqlAPIToken string CacheExpire int64 - Timeout int64 + QueryTimeout int64 RetryOnError bool MetricsPrefix string Queries []Query @@ -58,8 +58,8 @@ func Init(configPath string) error { Config.GraphqlAPIToken = val } - if Config.Timeout == 0 { - Config.Timeout = 60 + if Config.QueryTimeout == 0 { + Config.QueryTimeout = 60 } slog.Info(fmt.Sprintf("Finished reading config from %s", configPath)) diff --git a/internal/prometheus/prometheus.go b/internal/prometheus/prometheus.go index f7f396b..3baafb4 100644 --- a/internal/prometheus/prometheus.go +++ b/internal/prometheus/prometheus.go @@ -114,17 +114,21 @@ func buildLabelData(val interface{}, m config.Metric) (map[string]string, error) return metric.Labels, error_in_hash } -func (collector *GraphqlCollector) getMetrics(ctx context.Context) ([]Metric, error) { +func (collector *GraphqlCollector) getMetrics() ([]Metric, error) { var gql *Graphql var metrics []Metric for _, q := range config.Config.Queries { 
+ ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(config.Config.QueryTimeout)) result, err := graphql.GraphqlQuery(ctx, q.Query) + cancel() if err != nil { - return nil, fmt.Errorf("query error: %s", err) + slog.Error(fmt.Sprintf("query error: %s", err)) + continue } err = json.Unmarshal(result, &gql) if err != nil { - return nil, fmt.Errorf("unmarshal error: %s", err) + slog.Error(fmt.Sprintf("unmarshal error: %s", err)) + continue } data := gql.Data.(map[string]interface{}) for _, m := range q.Metrics { @@ -137,13 +141,13 @@ func (collector *GraphqlCollector) getMetrics(ctx context.Context) ([]Metric, er // loop through value path from config. extract result metric.ValueName, metric.Value, error_in_hash = buildValueData(val_hash, m.Value) if error_in_hash != nil { - slog.Error(fmt.Sprintf("got error: %s", error_in_hash)) + slog.Error(fmt.Sprintf("metric value build error: %s", error_in_hash)) continue } // loop through labels from config. Build label-value keypairs. 
metric.Labels, error_in_hash = buildLabelData(val, m) if error_in_hash != nil { - slog.Error(fmt.Sprintf("got error: %s", error_in_hash)) + slog.Error(fmt.Sprintf("metric labels build error: %s", error_in_hash)) continue } metric.Name = config.Config.MetricsPrefix + strings.Replace(m.Value, ",", "_", -1) @@ -158,9 +162,7 @@ func (collector *GraphqlCollector) Describe(ch chan<- *prometheus.Desc) {} func (collector *GraphqlCollector) updateMetrics() error { if time.Now().Unix()-collector.cachedAt > config.Config.CacheExpire { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(config.Config.Timeout)) - defer cancel() - metrics, err := collector.getMetrics(ctx) + metrics, err := collector.getMetrics() collector.accessMu.Lock() defer collector.accessMu.Unlock() if err != nil { From 720dee619ce19361fa5528ec924ea9fa689f3979 Mon Sep 17 00:00:00 2001 From: Vaidas Balys Date: Wed, 9 Oct 2024 15:17:24 +0300 Subject: [PATCH 3/3] Configure timeouts and resilience. The query timeout applies to each individual GraphQL query. Configure whether to attempt to execute as many queries as possible or bail out on the first error. Rename the "retryOnError" ("cache on error") config variable to "extendCacheOnError", with inverted meaning, for better clarity.
--- config_example.json | 3 ++- internal/config/config.go | 15 ++++++++------- internal/prometheus/prometheus.go | 18 +++++++++++++----- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/config_example.json b/config_example.json index 58637b6..6139e77 100644 --- a/config_example.json +++ b/config_example.json @@ -4,7 +4,8 @@ "graphqlAPIToken": "Token SECRET", "cacheExpire": 0, "queryTimeout": 60, - "retryOnError": false, + "failFast": false, + "extendCacheOnError": false, "queries":[ { "query": "query {device_list {name serial custom_fields}} {{NOW \"-1h\"}}", diff --git a/internal/config/config.go b/internal/config/config.go index 8f1464b..efa86d2 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -8,13 +8,14 @@ import ( ) type Cfg struct { - GraphqlURL string - GraphqlAPIToken string - CacheExpire int64 - QueryTimeout int64 - RetryOnError bool - MetricsPrefix string - Queries []Query + MetricsPrefix string + GraphqlURL string + GraphqlAPIToken string + CacheExpire int64 + QueryTimeout int64 + FailFast bool + ExtendCacheOnError bool + Queries []Query } type Query struct { diff --git a/internal/prometheus/prometheus.go b/internal/prometheus/prometheus.go index 3baafb4..ab1ac2c 100644 --- a/internal/prometheus/prometheus.go +++ b/internal/prometheus/prometheus.go @@ -122,13 +122,21 @@ func (collector *GraphqlCollector) getMetrics() ([]Metric, error) { result, err := graphql.GraphqlQuery(ctx, q.Query) cancel() if err != nil { - slog.Error(fmt.Sprintf("query error: %s", err)) - continue + if config.Config.FailFast { + return nil, err + } else { + slog.Error(fmt.Sprintf("query error: %s", err)) + continue + } } err = json.Unmarshal(result, &gql) if err != nil { - slog.Error(fmt.Sprintf("unmarshal error: %s", err)) - continue + if config.Config.FailFast { + return nil, err + } else { + slog.Error(fmt.Sprintf("unmarshal error: %s", err)) + continue + } } data := gql.Data.(map[string]interface{}) for _, m := range q.Metrics { @@ -167,7 
+175,7 @@ func (collector *GraphqlCollector) updateMetrics() error { defer collector.accessMu.Unlock() if err != nil { slog.Error(fmt.Sprintf("error collecting metrics: %s", err)) - if !config.Config.RetryOnError { + if config.Config.ExtendCacheOnError { collector.cachedAt = time.Now().Unix() } return err