Prometheus Installation and Deployment
1. Download the installation package
https://prometheus.io/download/
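A minimal fetch-and-unpack sketch for a Linux x86_64 host, assuming release 2.45.0 (pick the current version from the download page) and the /usr/local/prometheus install path used by the service unit below:

cd /usr/local
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz   # assumed version, for illustration only
tar -xzf prometheus-2.45.0.linux-amd64.tar.gz
mv prometheus-2.45.0.linux-amd64 prometheus
/usr/local/prometheus/prometheus --version   # verify the binary runs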
2. Edit the configuration file (prometheus.yml)
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # - alertmanager:9093
            - "192.168.123.13:9093" # Alertmanager address

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "rules/*.yml" # alerting rule files

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['192.168.123.13:9090'] # Prometheus monitoring itself

  #- job_name: 'node' # define per-target labels for each node_exporter here
  #  static_configs:
  #    - targets: ["192.168.123.10:9100"]
  #      labels:
  #        env: "prod"
  #        project: "node-exporter"
  #        addr: "192.168.123.10"
  #        instance: "node-2"
  #    - targets: ["192.168.123.11:9100"]
  #      labels:
  #        env: "prod"
  #        project: "node-exporter"
  #        addr: "192.168.123.11"
  #        instance: "node-3"

  #- job_name: 'geth' # define all scrape endpoints of one service in a single place
  #  metrics_path: /debug/metrics/prometheus
  #  scheme: http
  #  static_configs:
  #    - targets:
  #      - "192.168.123.13:6060"
  #      - "192.168.123.10:6060"
  #      - "192.168.123.9:6060"

  - job_name: 'node' # file-based service discovery job; define as many discovery jobs as your services require
    file_sd_configs:
      - files:
          - "conf.d/nodes/*.json" # target files to watch
        refresh_interval: 30s # re-read interval, default 5m

  - job_name: geth
    metrics_path: /debug/metrics/prometheus
    scheme: http
    file_sd_configs:
      - files:
          - "conf.d/project/geth.json" # target file to watch
        refresh_interval: 30s # re-read interval, default 5m

  - job_name: mysql # mysql_exporter scrape endpoint
    metrics_path: /metrics
    scheme: http
    static_configs:
      - targets:
          - "192.168.123.13:9104"
3. Write the alerting rules
groups:
  - name: Host-Group-001 # group name, must be unique within this file
    rules:
      - alert: InstanceDown # alert name, must be unique within the group
        expr: up == 0 # fires when the expression is true; up=1 means the target is online, up=0 means it is down
        for: 30s # how long the condition (up == 0) must hold before the alert fires
        labels:
          alert_type: project-alert # predefined alert type required by the webhook receiver; defined values: (host-alert, project-alert, others)
          severity: critical # custom label
        annotations:
          summary: "Service {{ $labels.project }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds. (currentValue={{ $value }}), LABELS = {{ $labels }}"
          value: "{{ $value }}"

      - alert: HighCpuUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance,env,group,job,project,addr,port) * 100) > 85
        for: 5m
        labels:
          alert_type: host-alert
          severity: "critical"
        annotations:
          summary: "CPU usage above 85%"
          # description: "Project: {{ $labels.project }}, host: {{ $labels.instance }}, IP: {{ $labels.addr }}, CPU usage: {{ $value | printf `%.2f` }}%"
          description: "Instance: {{ $labels.instance }} CPU usage is {{ $value | printf `%.2f` }}%"
          value: "{{ $value }}"
4. Write the service startup script (systemd unit)
vim prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=root
Group=root
WorkingDirectory=/usr/local/prometheus
ExecStart=/bin/sh -c '/usr/local/prometheus/prometheus --config.file=prometheus.yml --storage.tsdb.path=data/ --storage.tsdb.retention.time=730d --web.enable-remote-write-receiver --web.enable-lifecycle >>/usr/local/prometheus/prometheus.log 2>&1 '
ExecReload=/bin/kill -HUP $MAINPID
ExecStop=/bin/kill -s QUIT $MAINPID
Restart=on-failure
[Install]
WantedBy=multi-user.target
Then copy the unit file to /usr/lib/systemd/system/.
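A minimal sketch, assuming the unit file was written in the working directory /usr/local/prometheus:

cp /usr/local/prometheus/prometheus.service /usr/lib/systemd/system/prometheus.service
systemctl daemon-reload   # make systemd pick up the new unit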
5. Start the service
systemctl enable prometheus # enable at boot
systemctl start prometheus # start the service
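To confirm the server came up, check the unit status and the built-in health endpoint, then open http://192.168.123.13:9090/targets in a browser to verify that all scrape targets are UP:

systemctl status prometheus
curl http://192.168.123.13:9090/-/healthy   # should report the server as healthy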