По каким метрикам Weave Net следует настраивать оповещения? - PullRequest
0 голосов
/ 14 февраля 2020

Weave Net предоставляет следующие метрики Prometheus

enter image description here

Правильно ли настроены приведённые ниже оповещения? При каких значениях этих метрик следует поднимать оповещение, чтобы отслеживать состояние Weave Net?

  • WeaveNoFastDP weave_flows[5m] > 0
  • WeaveIPAMUnreachable weave_ipam_unreachable_percentage> 0
  • WeaveIPAMPendingAllocates weave_ipam_pending_allocates> 0
  • WeavePendingClaims weave_ipam_pending_claims> 0
  • WeaveConnecTerm weave_connection_terminations_total> 300

1 Ответ

0 голосов
/ 28 февраля 2020

Я сделал дашборды Grafana на основе метрик Weave Net. Вот эти дашборды:

  1. Weave Net https://grafana.com/grafana/dashboards/11789
  2. Weave Net (кластер) https://grafana.com/grafana/dashboards/11804

Вот полезные метрики, по которым следует отслеживать Weave Net. Ниже приведены правила оповещений в формате JSON.

enter image description here

{
  "groups": [
    {
      "name": "nodeagent",
      "rules": [
        {
          "alert": "UnhealthyNodes",
          "expr": "changes(central_nodeagent:node_route_unhealthy_count[3m]) > 0",
          "for": "1m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "Unhealthy nodes in the cluster. Go to the below prometheus link for details.",
            "description": "Actionable: Find why the node(s) are unhealthy and fix it."
          }
        }
      ]
    },
    {
      "name": "weave-net",
      "rules": [
        {
          "alert": "WeaveNetIPAMSPlitBrain",
          "expr": "max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNetIPAM has a split brain. Go to the below prometheus link for details.",
            "description": "Actionable: Every node should see same unreachability percentage. Please check and fix why it is not so."
          }
        },
        {
          "alert": "WeaveNetIPAMUnreachable",
          "expr": "weave_ipam_unreachable_percentage > 25",
          "for": "10m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNetIPAM unreachability percentage is above threshold. Go to the below prometheus link for details.",
            "description": "Actionable: Find why the unreachability threshold have increased from threshold and fix it. WeaveNet is responsible to keep it under control. Weave rm peer deployment can help clean things."
          }
        },
        {
          "alert": "WeaveNetIPAMPendingAllocates",
          "expr": "sum(weave_ipam_pending_allocates) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNet IPAM has pending allocates. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for IPAM allocates to be in pending state and fix it."
          }
        },
        {
          "alert": "WeaveNetIPAMPendingClaims",
          "expr": "sum(weave_ipam_pending_claims) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNet IPAM has pending claims. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for IPAM claims to be in pending state and fix it."
          }
        },
        {
          "alert": "WeaveNetFastDPFlowsLow",
          "expr": "sum(weave_flows) < 15000",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNet total FastDP flows is below threshold. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for fast dp flows dropping below the threshold."
          }
        },
        {
          "alert": "WeaveNetFastDPFlowsOff",
          "expr": "sum(weave_flows == bool 0) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNet FastDP flows is not happening in some or all nodes. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for fast dp being off."
          }
        },
        {
          "alert": "WeaveNetHighConnectionTerminationRate",
          "expr": "rate(weave_connection_terminations_total[5m]) > 0.1",
          "for": "5m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are getting terminated. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for high connection termination rate and fix it."
          }
        },
        {
          "alert": "WeaveNetConnectionsConnecting",
          "expr": "sum(weave_connections{state='connecting'}) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are in connecting state. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason and fix it."
          }
        },
        {
          "alert": "WeaveNetConnectionsRetying",
          "expr": "sum(weave_connections{state='retrying'}) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are in retrying state. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason and fix it."
          }
        },
        {
          "alert": "WeaveNetConnectionsPending",
          "expr": "sum(weave_connections{state='pending'}) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are in pending state. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason and fix it."
          }
        },
        {
          "alert": "WeaveNetConnectionsFailed",
          "expr": "sum(weave_connections{state='failed'}) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are in failed state. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason and fix it."
          }
        }
      ]
    }
  ]
}
...