Prometheus Installation and Deployment
1. Download the installation package
https://prometheus.io/download/
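A minimal fetch-and-unpack sketch for a Linux x86_64 host, assuming release 2.45.0 (pick the current version from the download page) and the /usr/local/prometheus install path used by the service unit below:

cd /usr/local
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz   # assumed version, for illustration only
tar -xzf prometheus-2.45.0.linux-amd64.tar.gz
mv prometheus-2.45.0.linux-amd64 prometheus
/usr/local/prometheus/prometheus --version   # verify the binary runs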
2. Edit the configuration file (prometheus.yml)
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # - alertmanager:9093
            - "192.168.123.13:9093" # Alertmanager address

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "rules/*.yml" # alerting rule files

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['192.168.123.13:9090'] # Prometheus monitoring itself

  #- job_name: 'node' # define per-target labels for each node_exporter here
  #  static_configs:
  #    - targets: ["192.168.123.10:9100"]
  #      labels:
  #        env: "prod"
  #        project: "node-exporter"
  #        addr: "192.168.123.10"
  #        instance: "node-2"
  #    - targets: ["192.168.123.11:9100"]
  #      labels:
  #        env: "prod"
  #        project: "node-exporter"
  #        addr: "192.168.123.11"
  #        instance: "node-3"

  #- job_name: 'geth' # define all scrape endpoints of one service in a single place
  #  metrics_path: /debug/metrics/prometheus
  #  scheme: http
  #  static_configs:
  #    - targets:
  #      - "192.168.123.13:6060"
  #      - "192.168.123.10:6060"
  #      - "192.168.123.9:6060"

  - job_name: 'node' # file-based service discovery job; define as many discovery jobs as your services require
    file_sd_configs:
      - files:
          - "conf.d/nodes/*.json" # target files to watch
        refresh_interval: 30s # re-read interval, default 5m

  - job_name: geth
    metrics_path: /debug/metrics/prometheus
    scheme: http
    file_sd_configs:
      - files:
          - "conf.d/project/geth.json" # target file to watch
        refresh_interval: 30s # re-read interval, default 5m

  - job_name: mysql # mysql_exporter scrape endpoint
    metrics_path: /metrics
    scheme: http
    static_configs:
      - targets:
          - "192.168.123.13:9104"
3. Write the alerting rules
groups:
  - name: Host-Group-001 # group name, must be unique within this file
    rules:
      - alert: InstanceDown # alert name, must be unique within the group
        expr: up == 0 # fires when the expression is true; up=1 means the target is online, up=0 means it is down
        for: 30s # how long the condition (up == 0) must hold before the alert fires
        labels:
          alert_type: project-alert # predefined alert type required by the webhook receiver; defined values: (host-alert, project-alert, others)
          severity: critical # custom label
        annotations:
          summary: "Service {{ $labels.project }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds. (currentValue={{ $value }}), LABELS = {{ $labels }}"
          value: "{{ $value }}"

      - alert: HighCpuUsage
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance,env,group,job,project,addr,port) * 100) > 85
        for: 5m
        labels:
          alert_type: host-alert
          severity: "critical"
        annotations:
          summary: "CPU usage above 85%"
          # description: "Project: {{ $labels.project }}, host: {{ $labels.instance }}, IP: {{ $labels.addr }}, CPU usage: {{ $value | printf `%.2f` }}%"
          description: "Instance: {{ $labels.instance }} CPU usage is {{ $value | printf `%.2f` }}%"
          value: "{{ $value }}"
4. Write the service startup script (systemd unit)
vim prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=root
Group=root
WorkingDirectory=/usr/local/prometheus
ExecStart=/bin/sh -c '/usr/local/prometheus/prometheus --config.file=prometheus.yml --storage.tsdb.path=data/ --storage.tsdb.retention.time=730d --web.enable-remote-write-receiver --web.enable-lifecycle >>/usr/local/prometheus/prometheus.log 2>&1 '
ExecReload=/bin/kill -HUP $MAINPID
ExecStop=/bin/kill -s QUIT $MAINPID
Restart=on-failure
[Install]
WantedBy=multi-user.target
Then copy the unit file to /usr/lib/systemd/system/.
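A minimal sketch, assuming the unit file was written in the working directory /usr/local/prometheus:

cp /usr/local/prometheus/prometheus.service /usr/lib/systemd/system/prometheus.service
systemctl daemon-reload   # make systemd pick up the new unit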
5. Start the service
systemctl enable prometheus # enable at boot
systemctl start prometheus # start the service
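To confirm the server came up, check the unit status and the built-in health endpoint, then open http://192.168.123.13:9090/targets in a browser to verify that all scrape targets are UP:

systemctl status prometheus
curl http://192.168.123.13:9090/-/healthy   # should report the server as healthy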