diff --git a/platform/overlays/gke/knative/monitoring/grafana/dashboard.config b/platform/overlays/gke/knative/monitoring/grafana/dashboard.config index 9646a2a464..dc6a09c1df 100755 --- a/platform/overlays/gke/knative/monitoring/grafana/dashboard.config +++ b/platform/overlays/gke/knative/monitoring/grafana/dashboard.config @@ -15,25 +15,31 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "id": 2, + "id": 1, "links": [], "panels": [ { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 5, + "h": 6, "w": 8, "x": 0, "y": 0 }, "hiddenSeries": false, - "id": 10, + "id": 16, "legend": { "avg": false, "current": false, @@ -46,10 +52,8 @@ "lines": true, "linewidth": 1, "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.3", "pointradius": 2, "points": false, "renderer": "flot", @@ -59,7 +63,7 @@ "steppedLine": false, "targets": [ { - "expr": "queue_requests_per_second{serving_knative_dev_service=\"dispatcher\"}", + "expr": "sum by (destination_configuration) (queue_requests_per_second{destination_configuration=\"scan-queue\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -69,7 +73,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Dispatcher Traffic", + "title": "Scan Queue Traffic", "tooltip": { "shared": true, "sort": 0, @@ -106,22 +110,112 @@ "alignLevel": null } }, + { + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 22, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "{serving_knative_dev_service=\"scan-queue\"} |= \"INFO\"", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "{serving_knative_dev_service=\"scan-queue\"} |= \"ERROR\"", + "legendFormat": "", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Scan Queue Logs", + "type": "logs" + }, + { + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 36, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 20, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "{app=\"tracker-api\"} |= \"info\"", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "{app=\"tracker-api\"} |= \"warn\"", + "legendFormat": "", + "refId": "B" + }, + { + "expr": "{app=\"tracker-api\"} |= \"error\"", + "legendFormat": "", + "refId": "C" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "API Logs", + "type": "logs" + }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 5, + "h": 6, "w": 8, - "x": 8, - "y": 0 + "x": 0, + "y": 6 }, "hiddenSeries": false, - "id": 12, + "id": 2, "legend": { "avg": false, "current": false, @@ -134,10 +228,8 @@ "lines": true, "linewidth": 1, "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.3", "pointradius": 2, "points": false, "renderer": "flot", @@ -147,7 +239,7 @@ "steppedLine": false, "targets": [ { - "expr": "queue_requests_per_second{serving_knative_dev_service=\"result-processor\"}", + "expr": "sum by (destination_configuration) (queue_requests_per_second{destination_configuration=\"https-scanner\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -157,7 +249,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Result Processor Traffic", + "title": "HTTPS Scanner Traffic", "tooltip": { "shared": true, "sort": 0, @@ -195,12 +287,18 @@ } }, { - "datasource": null, + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, "gridPos": { - "h": 15, + "h": 6, "w": 8, - "x": 16, - "y": 0 + "x": 8, + "y": 6 }, "id": 14, "options": { @@ -209,27 +307,46 @@ "sortOrder": "Descending", "wrapLogMessage": true }, + "targets": [ + { + "expr": "{serving_knative_dev_service=\"https-scanner\"} |= \"INFO\"", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "{serving_knative_dev_service=\"https-scanner\"} |= \"ERROR\"", + "legendFormat": "", + "refId": "B" + } + ], "timeFrom": null, "timeShift": null, - "title": "Logs", + "title": "HTTPS Scanner Logs", "type": "logs" }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 5, + "h": 6, "w": 8, "x": 0, - "y": 5 + "y": 12 }, "hiddenSeries": false, - "id": 8, + "id": 4, "legend": { "avg": false, "current": false, @@ -242,10 +359,8 @@ "lines": true, "linewidth": 1, "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.3", "pointradius": 2, "points": false, "renderer": "flot", @@ -255,7 +370,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(http_inprogress_requests{serving_knative_dev_service=\"dkim-scanner\"})", + "expr": "sum by (destination_configuration) (queue_requests_per_second{destination_configuration=\"ssl-scanner\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -265,7 +380,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "DKIM Scanner Traffic", + "title": "SSL Scanner Traffic", "tooltip": { "shared": true, "sort": 0, @@ -302,19 +417,65 @@ "alignLevel": null } }, + { + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 12 + }, + "id": 24, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "{serving_knative_dev_service=\"ssl-scanner\"} |= \"INFO\"", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "{serving_knative_dev_service=\"ssl-scanner\"} |= \"ERROR\"", + "legendFormat": "", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "SSL Scanner Logs", + "type": "logs" + }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 5, + "h": 6, "w": 8, - "x": 8, - "y": 5 + "x": 0, + "y": 18 }, "hiddenSeries": false, "id": 6, @@ -330,10 +491,8 @@ "lines": true, "linewidth": 1, "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.3", "pointradius": 2, "points": false, "renderer": "flot", @@ -343,7 +502,7 @@ "steppedLine": false, "targets": [ { - "expr": "queue_requests_per_second{serving_knative_dev_service=\"dmarc-scanner\"}", + "expr": "sum by (destination_configuration) (queue_requests_per_second{destination_configuration=\"dns-scanner\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -353,7 +512,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "DMARC Scanner Traffic", + "title": "DNS Scanner Traffic", "tooltip": { "shared": true, "sort": 0, @@ -390,22 +549,67 @@ "alignLevel": null } }, + { + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 18 + }, + "id": 26, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "{serving_knative_dev_service=\"dns-scanner\"} |= \"INFO\"", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "{serving_knative_dev_service=\"dns-scanner\"} |= \"ERROR\"", + "legendFormat": "", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "DNS Scanner Logs", + "type": "logs" + }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 5, + "h": 6, "w": 8, "x": 0, - "y": 10 + "y": 24 }, "hiddenSeries": false, - "id": 4, + "id": 18, "legend": { "avg": false, "current": false, @@ -418,10 +622,8 @@ "lines": true, "linewidth": 1, "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.3", "pointradius": 2, "points": false, "renderer": "flot", @@ -431,7 +633,7 @@ "steppedLine": false, "targets": [ { - "expr": "queue_requests_per_second{serving_knative_dev_service=\"ssl-scanner\"}", + "expr": "sum by (destination_configuration) (queue_requests_per_second{destination_configuration=\"result-queue\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -441,7 +643,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "SSL Scanner Traffic", + "title": "Result Queue Traffic", "tooltip": { "shared": true, "sort": 0, @@ -478,22 +680,68 @@ "alignLevel": null } }, + { + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 24 + }, + "id": 28, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "{serving_knative_dev_service=\"result-queue\"} |= \"INFO\"", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "{serving_knative_dev_service=\"result-queue\"} |= \"ERROR\"", + "legendFormat": "", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Result Queue Logs", + "type": "logs" + }, { "aliasColors": {}, - "bars": true, + "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, + "datasource": "Prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { - "h": 5, + "h": 6, "w": 8, - "x": 8, - "y": 10 + "x": 0, + "y": 30 }, "hiddenSeries": false, - "id": 2, + "id": 12, "legend": { "avg": false, "current": false, @@ -506,10 +754,8 @@ "lines": true, "linewidth": 1, "nullPointMode": "null", - "options": { - "dataLinks": [] - }, "percentage": false, + "pluginVersion": "7.1.3", "pointradius": 2, "points": false, "renderer": "flot", @@ -519,7 +765,7 @@ "steppedLine": false, "targets": [ { - "expr": "queue_requests_per_second{serving_knative_dev_service=\"https-scanner\"}", + "expr": "sum by (destination_configuration) (queue_requests_per_second{destination_configuration=\"result-processor\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -529,7 +775,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "HTTPS Scanner Traffic", + "title": "Result Processor Traffic", "tooltip": { "shared": true, "sort": 0, @@ -565,9 +811,48 @@ "align": false, "alignLevel": null } + }, + { + "datasource": "Loki", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 30 + }, + "id": 30, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "7.1.3", + "targets": [ + { + "expr": "{serving_knative_dev_service=\"result-processor\"} |= \"INFO\"", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "{serving_knative_dev_service=\"result-processor\"} |= \"ERROR\"", + "legendFormat": "", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Result Processor Logs", + "type": "logs" } ], - "schemaVersion": 22, + "schemaVersion": 26, "style": "dark", "tags": [], "templating": { @@ -594,8 +879,5 @@ "timezone": "", "title": "Scanners", "uid": "xcD_46jZk", - "variables": { - "list": [] - }, - "version": 2 + "version": 4 } diff --git a/platform/overlays/gke/knative/monitoring/prometheus.yaml b/platform/overlays/gke/knative/monitoring/prometheus.yaml index 23717a01d5..a4f2ec1152 100755 --- a/platform/overlays/gke/knative/monitoring/prometheus.yaml +++ b/platform/overlays/gke/knative/monitoring/prometheus.yaml @@ -86,70 +86,70 @@ data: severity: Info annotations: summary: SSL Scanner is not running - - name: DMARC Alert + - name: DNS Alert rules: - - alert: DMARCScannerHighTraffic - expr: queue_requests_per_second{serving_knative_dev_service="dmarc-scanner"} > 10 + - alert: DNSScannerHighTraffic + expr: queue_requests_per_second{serving_knative_dev_service="dns-scanner"} > 10 for: 20s labels: severity: Medium annotations: - summary: DMARC Scanner experiencing high volume of traffic - - alert: DMARCScannerDown - expr: up{serving_knative_dev_service="dmarc-scanner"} == 0 + summary: DNS Scanner experiencing high volume of traffic + - alert: DNSScannerDown + expr: up{serving_knative_dev_service="dns-scanner"} == 0 for: 5s labels: severity: Info annotations: - summary: DMARC Scanner is not running - - name: DKIM Alert + summary: DNS Scanner is not running + - name: Results Alert rules: - - alert: DKIMScannerHighTraffic - expr: queue_requests_per_second{serving_knative_dev_service="dkim-scanner"} > 10 + - alert: ResultProcessorHighTraffic + expr: queue_requests_per_second{serving_knative_dev_service="result-processor"} > 10 for: 20s labels: severity: Medium annotations: - summary: DKIM Scanner experiencing high volume of traffic - - alert: DKIMScannerDown - expr: up{serving_knative_dev_service="dkim-scanner"} == 0 + summary: Result Processor experiencing high volume of traffic + - alert: ResultProcessorDown + expr: up{serving_knative_dev_service="result-processor"} == 0 for: 5s labels: severity: Info annotations: - summary: DKIM Scanner is not running - - name: Dispatcher Alert + summary: Result Processor is not running + - name: Scan Queue Alert rules: - - alert: DispatcherHighTraffic - expr: queue_requests_per_second{serving_knative_dev_service="dispatcher"} > 10 + - alert: ScanQueueHighTraffic + expr: queue_requests_per_second{serving_knative_dev_service="scan-queue"} > 10 for: 20s labels: severity: Medium annotations: - summary: Dispatcher experiencing high volume of traffic - - alert: DispatcherDown - expr: up{serving_knative_dev_service="dispatcher"} == 0 + summary: Scan Queue experiencing high volume of traffic + - alert: ScanQueueDown + expr: up{serving_knative_dev_service="scan-queue"} == 0 for: 5s labels: severity: Info annotations: - summary: Dispatcher is not running - - name: Results Alert + summary: Scan Queue is not running + - name: Result Queue Alert rules: - - alert: ResultProcessorHighTraffic - expr: queue_requests_per_second{serving_knative_dev_service="result-processor"} > 10 + - alert: ResultQueueHighTraffic + expr: queue_requests_per_second{serving_knative_dev_service="result-queue"} > 10 for: 20s labels: severity: Medium annotations: - summary: Result Processor experiencing high volume of traffic - - alert: ResultProcessorDown - expr: up{serving_knative_dev_service="result-processor"} == 0 + summary: Result Queue experiencing high volume of traffic + - alert: ResultQueueDown + expr: up{serving_knative_dev_service="result-queue"} == 0 for: 5s labels: severity: Info annotations: - summary: Result Processor is not running + summary: Result Queue is not running prometheus.yml: |- global: scrape_interval: 5s @@ -170,8 +170,8 @@ data: - role: pod relabel_configs: - - source_labels: [__meta_kubernetes_pod_label_app] - regex: scanners + - source_labels: [__meta_kubernetes_namespace] + regex: scanners|api action: keep - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] action: keep