# Monitoring Metrics and Alerting System
## Core Monitoring Metrics

### Flow metrics

| Metric | Description | Unit |
|---|---|---|
| passQps | QPS of passed requests | req/s |
| blockQps | QPS of blocked (rate-limited) requests | req/s |
| successQps | QPS of successfully completed requests | req/s |
| exceptionQps | QPS of requests that ended in an exception | req/s |
| avgRt | Average response time | ms |
| minRt | Minimum response time | ms |
| maxConcurrency | Maximum concurrency | count |

### Rule metrics

| Metric | Description |
|---|---|
| flowRuleCount | Number of flow-control rules |
| degradeRuleCount | Number of circuit-breaking rules |
| systemRuleCount | Number of system rules |
| authorityRuleCount | Number of authority rules |

### System metrics

| Metric | Description |
|---|---|
| systemLoad | System load |
| cpuUsage | CPU usage |
| memoryUsage | Memory usage |
| threadCount | Thread count |

## Prometheus Integration

### Exposing metrics

```xml
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
```

```yaml
management:
  endpoints:
    web:
      exposure:
        include: '*'
  metrics:
    export:
      prometheus:
        enabled: true
```

### Custom metrics

```java
import com.alibaba.csp.sentinel.node.ClusterNode;
import com.alibaba.csp.sentinel.node.Node;
import com.alibaba.csp.sentinel.slotchain.ResourceWrapper;
import com.alibaba.csp.sentinel.slots.clusterbuilder.ClusterBuilderSlot;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Metrics;
import org.springframework.stereotype.Component;

import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

@Component
public class SentinelMetrics {

    private final MeterRegistry meterRegistry;

    public SentinelMetrics(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
        // Make the registry available for metric collection
        Metrics.addRegistry(meterRegistry);
        // Collect Sentinel metrics on a fixed schedule
        Executors.newScheduledThreadPool(1)
                .scheduleAtFixedRate(this::collectMetrics, 0, 10, TimeUnit.SECONDS);
    }

    private void collectMetrics() {
        // The cluster node map is keyed by ResourceWrapper, not by String
        Map<ResourceWrapper, ClusterNode> nodeMap = ClusterBuilderSlot.getClusterNodeMap();
        for (Map.Entry<ResourceWrapper, ClusterNode> entry : nodeMap.entrySet()) {
            String resource = entry.getKey().getName();
            ClusterNode node = entry.getValue();

            // Passed QPS
            Gauge.builder("sentinel.pass.qps", node, Node::passQps)
                    .tag("resource", resource)
                    .register(meterRegistry);

            // Blocked QPS
            Gauge.builder("sentinel.block.qps", node, Node::blockQps)
                    .tag("resource", resource)
                    .register(meterRegistry);

            // Average RT
            Gauge.builder("sentinel.avg.rt", node, Node::avgRt)
                    .tag("resource", resource)
                    .register(meterRegistry);
        }
    }
}
```

### Prometheus configuration

```yaml
# prometheus.yml
scrape_configs:
  - job_name: 'sentinel'
    metrics_path: '/actuator/prometheus'
    static_configs:
      - targets: ['localhost:8080']
```

## Grafana Dashboard

### Dashboard JSON

The exported metrics are gauges that already represent per-second rates, so the panels query them directly.

```json
{
  "dashboard": {
    "title": "Sentinel Monitoring",
    "panels": [
      {
        "title": "QPS",
        "type": "graph",
        "targets": [
          {
            "expr": "sentinel_pass_qps{resource=\"orderCreate\"}",
            "legendFormat": "passed"
          },
          {
            "expr": "sentinel_block_qps{resource=\"orderCreate\"}",
            "legendFormat": "blocked"
          }
        ]
      },
      {
        "title": "Block Ratio",
        "type": "graph",
        "targets": [
          {
            "expr": "sentinel_block_qps / (sentinel_pass_qps + sentinel_block_qps)",
            "legendFormat": "{{resource}}"
          }
        ]
      },
      {
        "title": "Average RT",
        "type": "graph",
        "targets": [
          {
            "expr": "sentinel_avg_rt",
            "legendFormat": "{{resource}}"
          }
        ]
      }
    ]
  }
}
```

## Alert Rules

### Prometheus alerting rules

```yaml
# alert_rules.yml
groups:
  - name: sentinel_alerts
    rules:
      # Frequent blocking
      - alert: HighBlockRate
        expr: |
          sentinel_block_qps / (sentinel_pass_qps + sentinel_block_qps) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Block ratio too high"
          description: "Block ratio of resource {{ $labels.resource }} exceeds 10%"

      # Circuit breaker opened
      - alert: CircuitBreakerOpen
        expr: sentinel_circuit_breaker_state == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Circuit breaker open"
          description: "Circuit breaker of resource {{ $labels.resource }} is open"

      # Average RT too high
      - alert: HighAvgRT
        expr: sentinel_avg_rt > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Average RT too high"
          description: "Average RT of resource {{ $labels.resource }} exceeds 1 second"

      # System load too high
      - alert: HighSystemLoad
        expr: system_load > 4.0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "System load too high"
          description: "System load is {{ $value }}"
```

### Alertmanager configuration

```yaml
# alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'resource']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'

receivers:
  - name: 'default'
    webhook_configs:
      - url: 'http://localhost:8060/webhook'
    email_configs:
      - to: 'ops@example.com'
        from: 'alert@example.com'
        smarthost: 'smtp.example.com:587'
```
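The route above delivers every notification to the webhook at `http://localhost:8060/webhook`. Below is a minimal sketch of what the receiving side could look like, assuming a Spring MVC endpoint that unpacks Alertmanager's JSON payload (an `alerts` array carrying `labels` and `annotations`) and forwards it to the DingTalk alerter shown in the next section. The controller name and field handling are illustrative assumptions, not part of the original setup.

```java
import java.util.List;
import java.util.Map;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RestController;

// Hypothetical Alertmanager webhook receiver; names are illustrative.
@RestController
public class AlertWebhookController {

    @Autowired
    private DingTalkAlerter dingTalkAlerter;

    @PostMapping("/webhook")
    @SuppressWarnings("unchecked")
    public void onAlert(@RequestBody Map<String, Object> payload) {
        // Alertmanager posts a JSON body whose "alerts" array holds one entry
        // per firing/resolved alert, each with its labels and annotations.
        List<Map<String, Object>> alerts =
                (List<Map<String, Object>>) payload.getOrDefault("alerts", List.of());

        for (Map<String, Object> alert : alerts) {
            Map<String, String> labels =
                    (Map<String, String>) alert.getOrDefault("labels", Map.of());
            Map<String, String> annotations =
                    (Map<String, String>) alert.getOrDefault("annotations", Map.of());

            // Build a markdown body and hand it to the DingTalk channel
            String content = String.format(
                    "### %s\n" +
                    "- **Resource**: %s\n" +
                    "- **Description**: %s",
                    annotations.getOrDefault("summary", "Sentinel alert"),
                    labels.getOrDefault("resource", "unknown"),
                    annotations.getOrDefault("description", ""));

            dingTalkAlerter.sendAlert("Sentinel Alert", content);
        }
    }
}
```

With a receiver like this in place, the Prometheus rules, the Alertmanager routing, and the DingTalk channel below form a single notification pipeline.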
## Custom Alerts

### DingTalk alerts

```java
@Component
public class DingTalkAlerter {

    private static final String WEBHOOK_URL =
            "https://oapi.dingtalk.com/robot/send?access_token=xxx";

    public void sendAlert(String title, String content) {
        Map<String, Object> message = new HashMap<>();
        message.put("msgtype", "markdown");

        Map<String, String> markdown = new HashMap<>();
        markdown.put("title", title);
        markdown.put("text", content);
        message.put("markdown", markdown);

        RestTemplate restTemplate = new RestTemplate();
        restTemplate.postForObject(WEBHOOK_URL, message, String.class);
    }
}
```

```java
@Component
public class SentinelAlerter {

    @Autowired
    private DingTalkAlerter dingTalkAlerter;

    private final AtomicLong blockCount = new AtomicLong(0);

    @PostConstruct
    public void init() {
        // Check the block count once per minute
        Executors.newScheduledThreadPool(1)
                .scheduleAtFixedRate(this::checkBlockCount, 0, 1, TimeUnit.MINUTES);
    }

    private void checkBlockCount() {
        long currentBlock = blockCount.get();
        if (currentBlock > 1000) { // more than 1000 blocks within one minute
            String content = String.format(
                    "### Sentinel Alert\n" +
                    "- **Alert type**: frequent blocking\n" +
                    "- **Block count**: %d per minute\n" +
                    "- **Alert time**: %s",
                    currentBlock,
                    LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)
            );
            dingTalkAlerter.sendAlert("Sentinel Block Alert", content);
        }
        blockCount.set(0); // reset the counter
    }

    public void recordBlock() {
        blockCount.incrementAndGet();
    }
}
```

`recordBlock()` has to be called wherever blocked requests are handled, for example from the `BlockException` branch of the logging aspect below.

## Log Monitoring

### Structured logging

```java
@Aspect
@Component
public class SentinelLogAspect {

    private static final Logger log = LoggerFactory.getLogger(SentinelLogAspect.class);

    @Around("@annotation(sentinelResource)")
    public Object around(ProceedingJoinPoint pjp, SentinelResource sentinelResource) throws Throwable {
        String resource = sentinelResource.value();
        long start = System.currentTimeMillis();
        try {
            Object result = pjp.proceed();
            long cost = System.currentTimeMillis() - start;
            // Successful call
            log.info("resource={}, status=success, cost={}ms", resource, cost);
            return result;
        } catch (BlockException e) {
            // Request was blocked by Sentinel
            log.warn("resource={}, status=blocked, reason={}", resource, e.getRule());
            throw e;
        } catch (Exception e) {
            // Business exception
            log.error("resource={}, status=error, message={}", resource, e.getMessage());
            throw e;
        }
    }
}
```

### ELK log analysis

```conf
# logstash.conf
input {
  file {
    path => "/var/log/sentinel/*.log"
    type => "sentinel"
  }
}

filter {
  grok {
    match => { "message" => "resource=%{WORD:resource}, status=%{WORD:status}, cost=%{NUMBER:cost}ms" }
  }
}

output {
  elasticsearch {
    hosts => ["localhost:9200"]
    index => "sentinel-%{+YYYY.MM.dd}"
  }
}
```

## Monitoring Dashboard

### Key metrics at a glance

```
┌──────────────────────────────────────────┐
│            Sentinel Dashboard            │
├──────────────────────────────────────────┤
│  Real-time QPS:    1,234 / 10,000        │
│  Block ratio:      2.3%                  │
│  Average RT:       45ms                  │
│  Circuit breakers: healthy ✅            │
├──────────────────────────────────────────┤
│  Top 5 blocked resources:                │
│  1. orderCreate    123 blocks            │
│  2. paymentPay      89 blocks            │
│  3. productQuery    56 blocks            │
├──────────────────────────────────────────┤
│  Recent alerts:                          │
│  [15:30] Order service blocked often     │
│  [14:20] Product service circuit opened  │
└──────────────────────────────────────────┘
```

## Summary

The monitoring and alerting system: ...