监控指标与告警体系

核心监控指标 流量指标 指标 说明 单位 passQps 通过QPS 次/秒 blockQps 限流QPS 次/秒 successQps 成功QPS 次/秒 exceptionQps 异常QPS 次/秒 avgRt 平均响应时间 毫秒 minRt 最小响应时间 毫秒 maxConcurrency 最大并发数 个 规则指标 指标 说明 flowRuleCount 流控规则数量 degradeRuleCount 熔断规则数量 systemRuleCount 系统规则数量 authorityRuleCount 授权规则数量 系统指标 指标 说明 systemLoad 系统负载 cpuUsage CPU使用率 memoryUsage 内存使用率 threadCount 线程数 Prometheus集成 暴露指标 <dependency> <groupId>io.micrometer</groupId> <artifactId>micrometer-registry-prometheus</artifactId> </dependency> management: endpoints: web: exposure: include: '*' metrics: export: prometheus: enabled: true 自定义指标 @Component public class SentinelMetrics { private final MeterRegistry meterRegistry; public SentinelMetrics(MeterRegistry meterRegistry) { this.meterRegistry = meterRegistry; // 注册指标采集 Metrics.addRegistry(meterRegistry); // 定时采集Sentinel指标 Executors.newScheduledThreadPool(1).scheduleAtFixedRate(() -> { collectMetrics(); }, 0, 10, TimeUnit.SECONDS); } private void collectMetrics() { Map<String, Node> nodeMap = ClusterBuilderSlot.getClusterNodeMap(); for (Map.Entry<String, ClusterNode> entry : nodeMap.entrySet()) { String resource = entry.getKey(); ClusterNode node = entry.getValue(); // 通过QPS Gauge.builder("sentinel.pass.qps", node, Node::passQps) .tag("resource", resource) .register(meterRegistry); // 限流QPS Gauge.builder("sentinel.block.qps", node, Node::blockQps) .tag("resource", resource) .register(meterRegistry); // 平均RT Gauge.builder("sentinel.avg.rt", node, Node::avgRt) .tag("resource", resource) .register(meterRegistry); } } } Prometheus配置 # prometheus.yml scrape_configs: - job_name: 'sentinel' metrics_path: '/actuator/prometheus' static_configs: - targets: ['localhost:8080'] Grafana Dashboard Dashboard JSON { "dashboard": { "title": "Sentinel Monitoring", "panels": [ { "title": "QPS", "type": "graph", "targets": [ { "expr": "rate(sentinel_pass_qps{resource=\"orderCreate\"}[1m])", "legendFormat": "通过" }, { "expr": "rate(sentinel_block_qps{resource=\"orderCreate\"}[1m])", "legendFormat": "限流" } ] }, 
{ "title": "限流比例", "type": "graph", "targets": [ { "expr": "rate(sentinel_block_qps[1m]) / (rate(sentinel_pass_qps[1m]) + rate(sentinel_block_qps[1m]))", "legendFormat": "{{resource}}" } ] }, { "title": "平均RT", "type": "graph", "targets": [ { "expr": "sentinel_avg_rt", "legendFormat": "{{resource}}" } ] } ] } } 告警规则 Prometheus Alertmanager # alert_rules.yml groups: - name: sentinel_alerts rules: # 限流频繁告警 - alert: HighBlockRate expr: | rate(sentinel_block_qps[1m]) / (rate(sentinel_pass_qps[1m]) + rate(sentinel_block_qps[1m])) > 0.1 for: 5m labels: severity: warning annotations: summary: "限流比例过高" description: "资源 {{ $labels.resource }} 限流比例超过10%" # 熔断告警 - alert: CircuitBreakerOpen expr: sentinel_circuit_breaker_state == 1 for: 1m labels: severity: critical annotations: summary: "熔断器开启" description: "资源 {{ $labels.resource }} 熔断器已开启" # RT过高告警 - alert: HighAvgRT expr: sentinel_avg_rt > 1000 for: 5m labels: severity: warning annotations: summary: "平均RT过高" description: "资源 {{ $labels.resource }} 平均RT超过1秒" # 系统负载过高 - alert: HighSystemLoad expr: system_load > 4.0 for: 2m labels: severity: critical annotations: summary: "系统负载过高" description: "系统负载为 {{ $value }}" Alertmanager配置 # alertmanager.yml global: resolve_timeout: 5m route: group_by: ['alertname', 'resource'] group_wait: 10s group_interval: 10s repeat_interval: 1h receiver: 'default' receivers: - name: 'default' webhook_configs: - url: 'http://localhost:8060/webhook' email_configs: - to: 'ops@example.com' from: 'alert@example.com' smarthost: 'smtp.example.com:587' 自定义告警 钉钉告警 @Component public class DingTalkAlerter { private static final String WEBHOOK_URL = "https://oapi.dingtalk.com/robot/send?access_token=xxx"; public void sendAlert(String title, String content) { Map<String, Object> message = new HashMap<>(); message.put("msgtype", "markdown"); Map<String, String> markdown = new HashMap<>(); markdown.put("title", title); markdown.put("text", content); message.put("markdown", markdown); RestTemplate restTemplate = 
new RestTemplate(); restTemplate.postForObject(WEBHOOK_URL, message, String.class); } } @Component public class SentinelAlerter { @Autowired private DingTalkAlerter dingTalkAlerter; private final AtomicLong blockCount = new AtomicLong(0); @PostConstruct public void init() { // 定时检查限流次数 Executors.newScheduledThreadPool(1).scheduleAtFixedRate(() -> { checkBlockCount(); }, 0, 1, TimeUnit.MINUTES); } private void checkBlockCount() { long currentBlock = blockCount.get(); if (currentBlock > 1000) { // 1分钟限流超过1000次 String content = String.format( "### Sentinel告警\\n" + "- **告警类型**: 限流频繁\\n" + "- **限流次数**: %d次/分钟\\n" + "- **告警时间**: %s", currentBlock, LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME) ); dingTalkAlerter.sendAlert("Sentinel限流告警", content); } blockCount.set(0); // 重置计数 } public void recordBlock() { blockCount.incrementAndGet(); } } 日志监控 结构化日志 @Aspect @Component public class SentinelLogAspect { private static final Logger log = LoggerFactory.getLogger(SentinelLogAspect.class); @Around("@annotation(sentinelResource)") public Object around(ProceedingJoinPoint pjp, SentinelResource sentinelResource) throws Throwable { String resource = sentinelResource.value(); long start = System.currentTimeMillis(); try { Object result = pjp.proceed(); long cost = System.currentTimeMillis() - start; // 记录成功日志 log.info("resource={}, status=success, cost={}ms", resource, cost); return result; } catch (BlockException e) { // 记录限流日志 log.warn("resource={}, status=blocked, reason={}", resource, e.getRule()); throw e; } catch (Exception e) { // 记录异常日志 log.error("resource={}, status=error, message={}", resource, e.getMessage()); throw e; } } } ELK日志分析 { "logstash": { "input": { "file": { "path": "/var/log/sentinel/*.log", "type": "sentinel" } }, "filter": { "grok": { "match": { "message": "resource=%{WORD:resource}, status=%{WORD:status}, cost=%{NUMBER:cost}ms" } } }, "output": { "elasticsearch": { "hosts": ["localhost:9200"], "index": "sentinel-%{+YYYY.MM.dd}" } } } } 
监控大盘

关键指标展示

┌─────────────────────────────────────┐
│ Sentinel监控大盘 │
├─────────────────────────────────────┤
│ 实时QPS: 1,234 / 10,000 │
│ 限流比例: 2.3% │
│ 平均RT: 45ms │
│ 熔断器状态: 正常 ✅ │
├─────────────────────────────────────┤
│ TOP 5 限流资源: │
│ 1. orderCreate 123次 │
│ 2. paymentPay 89次 │
│ 3. productQuery 56次 │
├─────────────────────────────────────┤
│ 告警记录: │
│ [15:30] 订单服务限流频繁 │
│ [14:20] 商品服务熔断 │
└─────────────────────────────────────┘

总结

监控告警体系: ...

2025-11-20 · maneng

监控告警体系:构建Redis可观测性

核心监控指标 1. 性能指标 @Component public class RedisMetricsCollector { @Autowired private RedisTemplate<String, String> redis; // QPS(每秒查询数) public long getQPS() { Properties info = redis.execute((RedisCallback<Properties>) connection -> connection.info("stats") ); return Long.parseLong(info.getProperty("instantaneous_ops_per_sec")); } // 延迟 public long getLatency() { long start = System.currentTimeMillis(); redis.opsForValue().get("health_check"); return System.currentTimeMillis() - start; } // 命中率 public double getHitRate() { Properties info = redis.execute((RedisCallback<Properties>) connection -> connection.info("stats") ); long hits = Long.parseLong(info.getProperty("keyspace_hits")); long misses = Long.parseLong(info.getProperty("keyspace_misses")); if (hits + misses == 0) { return 0; } return hits * 100.0 / (hits + misses); } // 慢查询数量 public long getSlowlogCount() { return redis.execute((RedisCallback<Long>) connection -> connection.slowlogLen() ); } } 2. 资源指标 // 内存使用 public Map<String, Object> getMemoryMetrics() { Properties info = redis.execute((RedisCallback<Properties>) connection -> connection.info("memory") ); Map<String, Object> metrics = new HashMap<>(); metrics.put("used_memory", Long.parseLong(info.getProperty("used_memory"))); metrics.put("used_memory_rss", Long.parseLong(info.getProperty("used_memory_rss"))); metrics.put("mem_fragmentation_ratio", Double.parseDouble(info.getProperty("mem_fragmentation_ratio"))); metrics.put("evicted_keys", Long.parseLong(info.getProperty("evicted_keys"))); return metrics; } // CPU使用 public double getCPUUsage() { Properties info = redis.execute((RedisCallback<Properties>) connection -> connection.info("cpu") ); return Double.parseDouble(info.getProperty("used_cpu_sys")); } // 连接数 public Map<String, Long> getConnectionMetrics() { Properties info = redis.execute((RedisCallback<Properties>) connection -> connection.info("clients") ); Map<String, Long> metrics = new HashMap<>(); metrics.put("connected_clients", 
Long.parseLong(info.getProperty("connected_clients"))); metrics.put("blocked_clients", Long.parseLong(info.getProperty("blocked_clients"))); return metrics; } 3. 持久化指标 public Map<String, Object> getPersistenceMetrics() { Properties info = redis.execute((RedisCallback<Properties>) connection -> connection.info("persistence") ); Map<String, Object> metrics = new HashMap<>(); // RDB metrics.put("rdb_last_save_time", Long.parseLong(info.getProperty("rdb_last_save_time"))); metrics.put("rdb_changes_since_last_save", Long.parseLong(info.getProperty("rdb_changes_since_last_save"))); // AOF if ("1".equals(info.getProperty("aof_enabled"))) { metrics.put("aof_current_size", Long.parseLong(info.getProperty("aof_current_size"))); metrics.put("aof_base_size", Long.parseLong(info.getProperty("aof_base_size"))); } return metrics; } 4. 复制指标 public Map<String, Object> getReplicationMetrics() { Properties info = redis.execute((RedisCallback<Properties>) connection -> connection.info("replication") ); Map<String, Object> metrics = new HashMap<>(); metrics.put("role", info.getProperty("role")); if ("master".equals(info.getProperty("role"))) { metrics.put("connected_slaves", Integer.parseInt(info.getProperty("connected_slaves"))); } else { metrics.put("master_link_status", info.getProperty("master_link_status")); metrics.put("master_last_io_seconds_ago", Integer.parseInt(info.getProperty("master_last_io_seconds_ago"))); } return metrics; } 告警规则 1. 
性能告警 @Component public class PerformanceAlerting { @Autowired private RedisMetricsCollector metrics; @Scheduled(fixedRate = 60000) // 每分钟 public void checkPerformance() { // QPS过高 long qps = metrics.getQPS(); if (qps > 50000) { sendAlert("QPS告警", String.format("当前QPS: %d", qps)); } // 延迟过高 long latency = metrics.getLatency(); if (latency > 100) { sendAlert("延迟告警", String.format("当前延迟: %dms", latency)); } // 命中率过低 double hitRate = metrics.getHitRate(); if (hitRate < 80) { sendAlert("命中率告警", String.format("当前命中率: %.2f%%", hitRate)); } // 慢查询过多 long slowlogCount = metrics.getSlowlogCount(); if (slowlogCount > 100) { sendAlert("慢查询告警", String.format("慢查询数量: %d", slowlogCount)); } } private void sendAlert(String title, String message) { log.warn("{}: {}", title, message); // 发送钉钉/邮件/短信告警 } } 2. 资源告警 @Scheduled(fixedRate = 60000) public void checkResources() { // 内存使用 Map<String, Object> memMetrics = metrics.getMemoryMetrics(); long usedMemory = (long) memMetrics.get("used_memory"); long maxMemory = 4L * 1024 * 1024 * 1024; // 4GB if (usedMemory > maxMemory * 0.9) { sendAlert("内存告警", String.format("内存使用: %dMB / %dMB", usedMemory / 1024 / 1024, maxMemory / 1024 / 1024)); } // 内存碎片率 double fragRatio = (double) memMetrics.get("mem_fragmentation_ratio"); if (fragRatio > 1.5) { sendAlert("内存碎片告警", String.format("碎片率: %.2f", fragRatio)); } // 连接数 Map<String, Long> connMetrics = metrics.getConnectionMetrics(); long connectedClients = connMetrics.get("connected_clients"); if (connectedClients > 1000) { sendAlert("连接数告警", String.format("当前连接数: %d", connectedClients)); } } 3. 
可用性告警 @Scheduled(fixedRate = 10000) // 每10秒 public void checkAvailability() { try { // 健康检查 redis.opsForValue().get("health_check"); } catch (Exception e) { sendAlert("Redis不可用", e.getMessage()); } // 主从复制状态 Map<String, Object> replMetrics = metrics.getReplicationMetrics(); if ("slave".equals(replMetrics.get("role"))) { String linkStatus = (String) replMetrics.get("master_link_status"); if (!"up".equals(linkStatus)) { sendAlert("主从复制断开", "master_link_status: " + linkStatus); } int lastIO = (int) replMetrics.get("master_last_io_seconds_ago"); if (lastIO > 60) { sendAlert("主从复制延迟", String.format("最后同步时间: %d秒前", lastIO)); } } } Prometheus + Grafana监控 1. 安装Redis Exporter docker run -d \ --name redis-exporter \ -p 9121:9121 \ oliver006/redis_exporter:latest \ --redis.addr=redis://redis:6379 2. Prometheus配置 # prometheus.yml scrape_configs: - job_name: 'redis' static_configs: - targets: ['redis-exporter:9121'] 3. Grafana Dashboard 导入官方Dashboard: ...

2025-01-22 · maneng

如约数科科技工作室

浙ICP备2025203501号

👀 本站总访问量 ...| 👤 访客数 ...| 📅 今日访问 ...