Refactoring
This commit is contained in:
408
kubernetes/afterwork-monitoring.yaml
Normal file
408
kubernetes/afterwork-monitoring.yaml
Normal file
@@ -0,0 +1,408 @@
|
||||
# ==============================================================================
|
||||
# AfterWork API - Configuration Monitoring pour Lions Infrastructure
|
||||
# ==============================================================================
|
||||
# Cette configuration intègre l'application avec:
|
||||
# - Prometheus (https://prometheus.lions.dev) - scraping auto via annotations
|
||||
# - Grafana (https://grafana.lions.dev) - dashboard dédié
|
||||
# ==============================================================================
|
||||
|
||||
---
|
||||
# ------------------------------------------------------------------------------
# ServiceMonitor for the AfterWork API (Prometheus Operator only)
# ------------------------------------------------------------------------------
# Lions infrastructure scrapes pods through pod annotations; this object is an
# alternative that can be used when the Prometheus Operator is deployed.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: afterwork-api-monitor
  namespace: monitoring
  labels:
    app: mic-after-work-server-impl-quarkus-main
    project: lions-infrastructure-2025
    # NOTE(review): presumably the label the Prometheus instance selects
    # ServiceMonitors on - confirm against the Prometheus resource.
    release: prometheus
spec:
  # The monitor lives in `monitoring` but targets Services in `applications`.
  namespaceSelector:
    matchNames:
      - applications
  selector:
    matchLabels:
      app: mic-after-work-server-impl-quarkus-main
  endpoints:
    # `port` refers to a named Service port.
    - port: http-direct
      scheme: http
      path: /afterwork/q/metrics
      interval: 30s
      scrapeTimeout: 10s
|
||||
|
||||
---
|
||||
# ==============================================================================
# PrometheusRule - Alerts for the AfterWork API
# ==============================================================================
# Application-level rules select series through the `kubernetes_namespace` and
# `app` target labels; container-level rules use the `namespace` / `pod`
# labels of the cAdvisor and kube-state-metrics series.
# NOTE(review): the `kubernetes_namespace` / `app` labels are assumed to be
# attached by the annotation-based pod scrape job - confirm against the
# Prometheus scrape configuration.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: afterwork-api-alerts
  namespace: monitoring
  labels:
    app: mic-after-work-server-impl-quarkus-main
    release: prometheus
    project: lions-infrastructure-2025
spec:
  groups:
    - name: afterwork-api.rules
      rules:
        # Application down: a scrape target reports up == 0, or no target
        # exists at all. The target is selected with the same labels as the
        # HTTP rules below instead of a `job` name regex, and `absent()`
        # covers the case where the target has vanished from service
        # discovery (in which case `up == 0` returns nothing).
        - alert: AfterWorkAPIDown
          expr: |
            up{
              kubernetes_namespace="applications",
              app="mic-after-work-server-impl-quarkus-main"
            } == 0
            or
            absent(up{
              kubernetes_namespace="applications",
              app="mic-after-work-server-impl-quarkus-main"
            })
          for: 2m
          labels:
            severity: critical
            application: afterwork-api
          annotations:
            summary: "AfterWork API is down"
            description: "L'API AfterWork n'est pas accessible depuis plus de 2 minutes"

        # HTTP 5xx ratio above 5% of all requests over a 5 minute window.
        - alert: AfterWorkHighErrorRate
          expr: |
            sum(rate(http_server_requests_seconds_count{
              kubernetes_namespace="applications",
              app="mic-after-work-server-impl-quarkus-main",
              status=~"5.."
            }[5m])) /
            sum(rate(http_server_requests_seconds_count{
              kubernetes_namespace="applications",
              app="mic-after-work-server-impl-quarkus-main"
            }[5m])) > 0.05
          for: 5m
          labels:
            severity: warning
            application: afterwork-api
          annotations:
            summary: "High error rate on AfterWork API"
            description: "Le taux d'erreur 5xx est supérieur à 5% depuis 5 minutes"

        # p95 request latency above 2 seconds.
        - alert: AfterWorkHighLatency
          expr: |
            histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{
              kubernetes_namespace="applications",
              app="mic-after-work-server-impl-quarkus-main"
            }[5m])) by (le)) > 2
          for: 5m
          labels:
            severity: warning
            application: afterwork-api
          annotations:
            summary: "High latency on AfterWork API"
            description: "La latence p95 dépasse 2 secondes depuis 5 minutes"

        # Memory working set above 85% of the limit, evaluated per pod so one
        # saturated replica is not averaged away by the others. The
        # container!="" / container!="POD" matchers drop the pod-level cgroup
        # and pause-container series that cAdvisor exports next to the real
        # containers; summing them as well double-counts usage.
        - alert: AfterWorkHighMemoryUsage
          expr: |
            sum by (pod) (container_memory_working_set_bytes{
              namespace="applications",
              pod=~"mic-after-work-server-impl-quarkus-main.*",
              container!="",
              container!="POD"
            }) /
            sum by (pod) (container_spec_memory_limit_bytes{
              namespace="applications",
              pod=~"mic-after-work-server-impl-quarkus-main.*",
              container!="",
              container!="POD"
            }) > 0.85
          for: 5m
          labels:
            severity: warning
            application: afterwork-api
          annotations:
            summary: "High memory usage on AfterWork API"
            description: "L'utilisation mémoire dépasse 85% de la limite"

        # More than 3 container restarts within the last hour.
        - alert: AfterWorkPodRestarts
          expr: |
            increase(kube_pod_container_status_restarts_total{
              namespace="applications",
              pod=~"mic-after-work-server-impl-quarkus-main.*"
            }[1h]) > 3
          for: 5m
          labels:
            severity: warning
            application: afterwork-api
          annotations:
            summary: "AfterWork API pod restarting frequently"
            description: "Le pod a redémarré plus de 3 fois dans la dernière heure"
|
||||
|
||||
---
|
||||
# ==============================================================================
# Grafana Dashboard ConfigMap (for automatic import)
# ==============================================================================
# Holds the AfterWork API dashboard as a JSON document under a single data key.
# NOTE(review): the `grafana_dashboard: "1"` label is presumably what the
# Grafana dashboard sidecar watches for - confirm against the Grafana setup.
#
# Query conventions (JSON cannot carry comments, so they are recorded here):
# - Application panels, including "API Status", select series with the
#   `kubernetes_namespace` / `app` labels rather than a `job` name regex.
# - Memory and CPU panels exclude the cAdvisor pod-level (container="") and
#   pause-container (container="POD") series so usage is not double-counted.
apiVersion: v1
kind: ConfigMap
metadata:
  name: afterwork-grafana-dashboard
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
    app: mic-after-work-server-impl-quarkus-main
    project: lions-infrastructure-2025
data:
  afterwork-api-dashboard.json: |
    {
      "annotations": {
        "list": []
      },
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "liveNow": false,
      "panels": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {"color": "green", "value": null},
                  {"color": "yellow", "value": 100},
                  {"color": "red", "value": 500}
                ]
              },
              "unit": "reqps"
            }
          },
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {},
          "targets": [
            {
              "expr": "sum(rate(http_server_requests_seconds_count{kubernetes_namespace=\"applications\",app=\"mic-after-work-server-impl-quarkus-main\"}[5m]))",
              "legendFormat": "Requests/s",
              "refId": "A"
            }
          ],
          "title": "Request Rate",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "unit": "ms"
            }
          },
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "options": {},
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{kubernetes_namespace=\"applications\",app=\"mic-after-work-server-impl-quarkus-main\"}[5m])) by (le)) * 1000",
              "legendFormat": "p95 Latency",
              "refId": "A"
            },
            {
              "expr": "histogram_quantile(0.50, sum(rate(http_server_requests_seconds_bucket{kubernetes_namespace=\"applications\",app=\"mic-after-work-server-impl-quarkus-main\"}[5m])) by (le)) * 1000",
              "legendFormat": "p50 Latency",
              "refId": "B"
            }
          ],
          "title": "Response Time",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "unit": "percent"
            }
          },
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
          "id": 3,
          "options": {},
          "targets": [
            {
              "expr": "sum(rate(http_server_requests_seconds_count{kubernetes_namespace=\"applications\",app=\"mic-after-work-server-impl-quarkus-main\",status=~\"5..\"}[5m])) / sum(rate(http_server_requests_seconds_count{kubernetes_namespace=\"applications\",app=\"mic-after-work-server-impl-quarkus-main\"}[5m])) * 100",
              "legendFormat": "Error Rate %",
              "refId": "A"
            }
          ],
          "title": "Error Rate",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "unit": "bytes"
            }
          },
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
          "id": 4,
          "options": {},
          "targets": [
            {
              "expr": "sum(container_memory_working_set_bytes{namespace=\"applications\",pod=~\"mic-after-work-server-impl-quarkus-main.*\",container!=\"\",container!=\"POD\"})",
              "legendFormat": "Memory Used",
              "refId": "A"
            },
            {
              "expr": "sum(container_spec_memory_limit_bytes{namespace=\"applications\",pod=~\"mic-after-work-server-impl-quarkus-main.*\",container!=\"\",container!=\"POD\"})",
              "legendFormat": "Memory Limit",
              "refId": "B"
            }
          ],
          "title": "Memory Usage",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "palette-classic"
              },
              "unit": "short"
            }
          },
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
          "id": 5,
          "options": {},
          "targets": [
            {
              "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"applications\",pod=~\"mic-after-work-server-impl-quarkus-main.*\",container!=\"\",container!=\"POD\"}[5m])) * 1000",
              "legendFormat": "CPU Usage (millicores)",
              "refId": "A"
            }
          ],
          "title": "CPU Usage",
          "type": "timeseries"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {"color": "red", "value": null},
                  {"color": "green", "value": 1}
                ]
              }
            }
          },
          "gridPos": {"h": 4, "w": 6, "x": 12, "y": 16},
          "id": 6,
          "options": {
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["lastNotNull"],
              "fields": "",
              "values": false
            },
            "showThresholdLabels": false,
            "showThresholdMarkers": true
          },
          "targets": [
            {
              "expr": "up{kubernetes_namespace=\"applications\",app=\"mic-after-work-server-impl-quarkus-main\"}",
              "legendFormat": "Status",
              "refId": "A"
            }
          ],
          "title": "API Status",
          "type": "gauge"
        },
        {
          "datasource": {
            "type": "prometheus",
            "uid": "prometheus"
          },
          "fieldConfig": {
            "defaults": {
              "color": {
                "mode": "thresholds"
              },
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  {"color": "green", "value": null},
                  {"color": "yellow", "value": 1},
                  {"color": "red", "value": 3}
                ]
              }
            }
          },
          "gridPos": {"h": 4, "w": 6, "x": 18, "y": 16},
          "id": 7,
          "options": {
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["lastNotNull"],
              "fields": "",
              "values": false
            }
          },
          "targets": [
            {
              "expr": "increase(kube_pod_container_status_restarts_total{namespace=\"applications\",pod=~\"mic-after-work-server-impl-quarkus-main.*\"}[1h])",
              "legendFormat": "Restarts (1h)",
              "refId": "A"
            }
          ],
          "title": "Pod Restarts (1h)",
          "type": "stat"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 38,
      "style": "dark",
      "tags": ["lions", "afterwork", "quarkus", "api"],
      "templating": {
        "list": []
      },
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "timepicker": {},
      "timezone": "browser",
      "title": "AfterWork API Dashboard",
      "uid": "afterwork-api",
      "version": 1,
      "weekStart": ""
    }
|
||||
Reference in New Issue
Block a user