-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update and add callback alarms #1507
Conversation
Updating alarms ⏰? Great! Please update the Google Sheet and add a 👍 to this message after 🙏 |
aws/eks/cloudwatch_log.tf
Outdated
count = var.cloudwatch_enabled ? 1 : 0 | ||
name = "callback-max-retry-failures" | ||
pattern = "send_delivery_status_to_service has retried the max num of times for callback url" | ||
name = "allback-request-failures" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
name = "callback-request-failures"
Staging: eks✅ Terraform Init: Plan: 4 to add, 1 to change, 2 to destroy Show summary
Show plan
Resource actions are indicated with the following symbols:
+ create
~ update in-place
- destroy
Terraform will perform the following actions:
# aws_cloudwatch_log_metric_filter.callback-max-retry-failures[0] will be destroyed
# (because aws_cloudwatch_log_metric_filter.callback-max-retry-failures is not in configuration)
- resource "aws_cloudwatch_log_metric_filter" "callback-max-retry-failures" {
- id = "callback-max-retry-failures" -> null
- log_group_name = "/aws/containerinsights/notification-canada-ca-staging-eks-cluster/application" -> null
- name = "callback-max-retry-failures" -> null
- pattern = "send_delivery_status_to_service has retried the max num of times for callback url" -> null
- metric_transformation {
- dimensions = {} -> null
- name = "callback-max-retry-failures" -> null
- namespace = "LogMetrics" -> null
- unit = "None" -> null
- value = "1" -> null
}
}
# aws_cloudwatch_log_metric_filter.callback-request-failures[0] will be created
+ resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" {
+ id = (known after apply)
+ log_group_name = "/aws/containerinsights/notification-canada-ca-staging-eks-cluster/application"
+ name = "callback-request-failures"
+ pattern = "send_delivery_status_to_service request failed for notification_id:"
+ metric_transformation {
+ name = "callback-max-retry-failures"
+ namespace = "LogMetrics"
+ unit = "None"
+ value = "1"
}
}
# aws_cloudwatch_metric_alarm.service-callback-too-many-failures-critical[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-critical" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-critical",
]
+ alarm_description = "Service reached the max number of callback retries 100 times in 10 minutes"
+ alarm_name = "service-callback-too-many-failures-warning"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 1
+ id = (known after apply)
+ metric_name = "callback-max-retry-failures"
+ namespace = "LogMetrics"
+ period = 600
+ statistic = "Sum"
+ tags_all = (known after apply)
+ threshold = 100
+ treat_missing_data = "notBreaching"
}
# aws_cloudwatch_metric_alarm.service-callback-too-many-failures-warning[0] will be updated in-place
~ resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" {
~ alarm_description = "Service reached the max number of callback retries 5 times in 30 minutes" -> "Service reached the max number of callback retries 25 times in 5 minutes"
id = "service-callback-too-many-failures-warning"
~ period = 1800 -> 300
tags = {}
~ threshold = 5 -> 25
# (15 unchanged attributes hidden)
}
# aws_cloudwatch_query_definition.callback-failures[0] will be created
+ resource "aws_cloudwatch_query_definition" "callback-failures" {
+ id = (known after apply)
+ log_group_names = [
+ "/aws/containerinsights/notification-canada-ca-staging-eks-cluster/application",
]
+ name = "Callbacks / Callback errors by notification_id"
+ query_definition_id = (known after apply)
+ query_string = <<-EOT
fields @timestamp, @notification_id, @url, @error
| filter kubernetes.container_name like /^celery/
| filter @message like /send_delivery_status_to_service request failed for notification_id:/
| parse @message 'send_delivery_status_to_service request failed for notification_id: * and url: * exc: *' as @notification_id, @url, @error
| limit 10000
EOT
}
# aws_cloudwatch_query_definition.callback-failures-by-service[0] will be destroyed
# (because aws_cloudwatch_query_definition.callback-failures-by-service is not in configuration)
- resource "aws_cloudwatch_query_definition" "callback-failures-by-service" {
- id = "165cd51c-831b-45bc-97bc-e5268f0da600" -> null
- log_group_names = [
- "/aws/containerinsights/notification-canada-ca-staging-eks-cluster/application",
] -> null
- name = "Callbacks / Count of callbacks that exceeded MaxRetries by service" -> null
- query_definition_id = "165cd51c-831b-45bc-97bc-e5268f0da600" -> null
- query_string = <<-EOT
fields @timestamp, @service_id, @callback_url, @notification_id
| filter kubernetes.container_name like /^celery/
| filter @message like /send_delivery_status_to_service has retried the max num of times for callback url/
| parse @message 'Retry: send_delivery_status_to_service has retried the max num of times for callback url * and notification_id: * for service: *' as @callback_url, @notification_id, @service_id
| sort @timestamp desc
| stats count(@service_id) by @service_id, bin(30m)
| limit 10000
EOT -> null
}
# aws_cloudwatch_query_definition.callback-max-retry-failures-by-service[0] will be created
+ resource "aws_cloudwatch_query_definition" "callback-max-retry-failures-by-service" {
+ id = (known after apply)
+ log_group_names = [
+ "/aws/containerinsights/notification-canada-ca-staging-eks-cluster/application",
]
+ name = "Callbacks / Callbacks that exceeded MaxRetries by service"
+ query_definition_id = (known after apply)
+ query_string = <<-EOT
fields @timestamp, @service_id, @callback_url, @notification_id
| filter kubernetes.container_name like /^celery/
| filter @message like /send_delivery_status_to_service has retried the max num of times for callback url/
| parse @message 'Retry: send_delivery_status_to_service has retried the max num of times for callback url * and notification_id: * for service: *' as @callback_url, @notification_id, @service_id
| sort @timestamp desc
| stats count(@service_id) by @service_id, bin(30m)
| limit 10000
EOT
}
Plan: 4 to add, 1 to change, 2 to destroy.
─────────────────────────────────────────────────────────────────────────────
Saved the plan to: plan.tfplan
To perform exactly these actions, run the following command to apply:
terraform apply "plan.tfplan"
Show Conftest results
WARN - plan.json - main - Cloudwatch log metric pattern is invalid: ["aws_cloudwatch_log_metric_filter.callback-request-failures[0]"]
WARN - plan.json - main - Cloudwatch log metric pattern is invalid: ["aws_cloudwatch_log_metric_filter.celery-error[0]"]
WARN - plan.json - main - Cloudwatch log metric pattern is invalid: ["aws_cloudwatch_log_metric_filter.scanfiles-timeout[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_acm_certificate.client_vpn"]
WARN - plan.json - main - Missing Common Tags: ["aws_acm_certificate.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_acm_certificate.notification-canada-ca-alt[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_listener.internal_alb_tls"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_listener.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.internal_nginx_http"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-admin"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-api"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-document"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-document-api"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-documentation"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-cluster-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-evicted-pods[0]"]
WARN -... |
Summary | Résumé
This PR adds `notification_id` and error reason.
Related Issues | Cartes liées
Test instructions | Instructions pour tester la modification
TF plan passes
Release Instructions | Instructions pour le déploiement
None.
Reviewer checklist | Liste de vérification du réviseur