Skip to content

Commit

Permalink
Improve events timestamps
Browse files Browse the repository at this point in the history
  • Loading branch information
ReallyLiri committed Oct 28, 2021
1 parent aac4674 commit ead8d2d
Show file tree
Hide file tree
Showing 10 changed files with 108 additions and 130 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ TBD

```
NAME:
kubescout - 0.1.5 - Scout for alarming issues in your Kubernetes cluster
kubescout - 0.1.7 - Scout for alarming issues in your Kubernetes cluster
USAGE:
kubescout [optional flags]
Expand All @@ -191,7 +191,6 @@ OPTIONS:
--dedup-minutes value, -d value time in minutes to silence duplicate or already observed alerts, or 0 to disable deduplication (default: 60)
--store-filepath value, -s value path to store file where state will be persisted or empty string to disable persistence (default: "kube-scout.store.json")
--output value, -o value output mode, one of pretty/json/yaml/discard (default: "pretty")
--iterations value, --it value number of diag iterations, meant to better capture constantly changing states (default: 3)
--context value, -c value context name to use from kubeconfig, defaults to current context
--all-contexts, -a iterate all kubeconfig contexts, 'context' flag will be ignored if this flag is set (default: false)
--exclude-contexts value a comma-separated list of kubeconfig context names to skip, only relevant if 'all-contexts' flag is set
Expand Down
13 changes: 0 additions & 13 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ type Config struct {
MessagesDeduplicationDuration time.Duration
StoreFilePath string
OutputMode string
Iterations int
ContextName string
AllContexts bool
ExcludeContexts []string
Expand Down Expand Up @@ -139,13 +138,6 @@ var Flags = []cli.Flag{
Usage: "output mode, one of pretty/json/yaml/discard",
Required: false,
},
&cli.IntFlag{
Name: "iterations",
Aliases: []string{"it"},
Value: 3,
Usage: "number of diag iterations, meant to better capture constantly changing states",
Required: false,
},
&cli.StringFlag{
Name: "context",
Aliases: []string{"c"},
Expand Down Expand Up @@ -221,7 +213,6 @@ func ParseConfig(c *cli.Context) (*Config, error) {
MessagesDeduplicationDuration: time.Minute * time.Duration(c.Int("dedup-minutes")),
StoreFilePath: c.String("store-filepath"),
OutputMode: c.String("output"),
Iterations: c.Int("iterations"),
ContextName: c.String("context"),
AllContexts: c.Bool("all-contexts"),
ExcludeContexts: splitListFlag(c.String("exclude-contexts")),
Expand Down Expand Up @@ -265,10 +256,6 @@ func ParseConfig(c *cli.Context) (*Config, error) {
}
}

if config.Iterations <= 0 {
return nil, fmt.Errorf("number of iterations is invalid: %v", config.Iterations)
}

return config, nil
}

Expand Down
27 changes: 9 additions & 18 deletions diag/diag.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package diag

import (
"github.com/drgrib/iter"
"github.com/reallyliri/kubescout/alert"
"github.com/reallyliri/kubescout/config"
"github.com/reallyliri/kubescout/internal"
Expand All @@ -12,8 +11,6 @@ import (
"time"
)

const sleepBetweenIterations = time.Second * time.Duration(3)

type diagContext struct {
config *config.Config
store *store.ClusterStore
Expand Down Expand Up @@ -156,15 +153,9 @@ func DiagnoseCluster(client kubeclient.KubernetesClient, cfg *config.Config, sto

log.Infof("Diagnosing cluster %v ...", store.Cluster)

for i := range iter.N(cfg.Iterations) {
if i > 0 {
time.Sleep(sleepBetweenIterations)
context.now = context.now.Add(sleepBetweenIterations)
}
err := context.clusterIteration(i)
if err != nil {
return err
}
err := context.collectStates()
if err != nil {
return err
}

for name, state := range context.statesByName {
Expand All @@ -183,7 +174,7 @@ func DiagnoseCluster(client kubeclient.KubernetesClient, cfg *config.Config, sto
return
}

func (context *diagContext) clusterIteration(i int) error {
func (context *diagContext) collectStates() error {
client := context.client
namespaces, err := client.GetNamespaces()
if err != nil {
Expand All @@ -192,7 +183,7 @@ func (context *diagContext) clusterIteration(i int) error {

var aggregatedError error

log.Debugf("Discovered %v namespaces (iter=%v)", len(namespaces), i)
log.Debugf("Discovered %v namespaces", len(namespaces))
for _, namespace := range namespaces {
namespaceName := namespace.Name
if !context.isNamespaceRelevant(namespaceName) {
Expand All @@ -203,7 +194,7 @@ func (context *diagContext) clusterIteration(i int) error {
if err != nil {
aggregatedError = multierr.Append(aggregatedError, err)
} else {
log.Debugf("Discovered %v events in namespace %v (iter=%v)", len(events), namespaceName, i)
log.Debugf("Discovered %v events in namespace %v", len(events), namespaceName)
for _, event := range events {
_, err = context.eventState(&event)
if err != nil {
Expand All @@ -216,7 +207,7 @@ func (context *diagContext) clusterIteration(i int) error {
if err != nil {
aggregatedError = multierr.Append(aggregatedError, err)
} else {
log.Debugf("Discovered %v pods in namespace %v (iter=%v)", len(pods), namespaceName, i)
log.Debugf("Discovered %v pods in namespace %v", len(pods), namespaceName)
for _, pod := range pods {
_, err = context.podState(&pod)
if err != nil {
Expand All @@ -229,7 +220,7 @@ func (context *diagContext) clusterIteration(i int) error {
if err != nil {
aggregatedError = multierr.Append(aggregatedError, err)
} else {
log.Debugf("Discovered %v replica sets in namespace %v (iter=%v)", len(replicaSets), namespaceName, i)
log.Debugf("Discovered %v replica sets in namespace %v", len(replicaSets), namespaceName)
for _, replicaSet := range replicaSets {
_, err = context.replicaSetState(&replicaSet)
if err != nil {
Expand All @@ -243,7 +234,7 @@ func (context *diagContext) clusterIteration(i int) error {
if err != nil {
aggregatedError = multierr.Append(aggregatedError, err)
} else {
log.Debugf("Discovered %v nodes (iter=%v)", len(nodes), i)
log.Debugf("Discovered %v nodes", len(nodes))
for _, node := range nodes {
_, err = context.nodeState(&node, false)
if err != nil {
Expand Down
16 changes: 8 additions & 8 deletions diag/diag_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func TestDiagnose(t *testing.T) {
assert.Equal(t, "Node", alerts[i].Kind)
assert.Equal(t, 0, len(alerts[i].Messages))
assert.Equal(t, 1, len(alerts[i].Events))
assert.Equal(t, `Event by sysctl-monitor: NodeSysctlChange x53 since 17 Oct 21 14:15 UTC (last seen 4 minutes ago)`, alerts[i].Events[0])
assert.Equal(t, `Event by sysctl-monitor: NodeSysctlChange x53 since 17 Oct 21 14:15 UTC, 4 minutes ago`, alerts[i].Events[0])
assert.Equal(t, 0, len(alerts[i].LogsByContainerName))

i = 1
Expand All @@ -78,11 +78,11 @@ func TestDiagnose(t *testing.T) {
assert.Equal(t, 1, len(alerts[i].Messages))
assert.Equal(t, "Container test-2-broken-image still waiting due to ImagePullBackOff: Back-off pulling image \"nginx:l4t3st\"", alerts[i].Messages[0])
assert.Equal(t, 3, len(alerts[i].Events))
assert.Equal(t, `Event by kubelet: Failed x4 since 17 Oct 21 14:15 UTC (last seen 2 minutes ago):
assert.Equal(t, `Event by kubelet: Failed x4 since 17 Oct 21 14:15 UTC, 4 minutes ago (last seen 2 minutes ago):
Failed to pull image "nginx:l4t3st": rpc error: code = Unknown desc = Error response from daemon: manifest for nginx:l4t3st not found: manifest unknown: manifest unknown`, alerts[i].Events[0])
assert.Equal(t, `Event by kubelet: Failed x4 since 17 Oct 21 14:15 UTC (last seen 2 minutes ago):
assert.Equal(t, `Event by kubelet: Failed x4 since 17 Oct 21 14:15 UTC, 4 minutes ago (last seen 2 minutes ago):
Error: ErrImagePull`, alerts[i].Events[1])
assert.Equal(t, `Event by kubelet: Failed x6 since 17 Oct 21 14:15 UTC (last seen 2 minutes ago):
assert.Equal(t, `Event by kubelet: Failed x6 since 17 Oct 21 14:15 UTC, 4 minutes ago (last seen 2 minutes ago):
Error: ImagePullBackOff`, alerts[i].Events[2])
assert.Equal(t, 1, len(alerts[i].LogsByContainerName))
assert.Equal(t, "default/test-2-broken-image-7cbf974df9-4jv8f/test-2-broken-image/logs", alerts[i].LogsByContainerName["test-2-broken-image"])
Expand All @@ -95,7 +95,7 @@ func TestDiagnose(t *testing.T) {
assert.Equal(t, 1, len(alerts[i].Messages))
assert.Equal(t, "Unschedulable: 0/1 nodes are available: 1 Insufficient memory. (last transition: 4 minutes ago)", alerts[i].Messages[0])
assert.Equal(t, 1, len(alerts[i].Events))
assert.Equal(t, `Event by default-scheduler: FailedScheduling since 17 Oct 21 14:15 UTC (last seen 4 minutes ago):
assert.Equal(t, `Event by default-scheduler: FailedScheduling since 17 Oct 21 14:15 UTC, 4 minutes ago:
0/1 nodes are available: 1 Insufficient memory.`, alerts[i].Events[0])
assert.Equal(t, 0, len(alerts[i].LogsByContainerName))

Expand All @@ -107,7 +107,7 @@ func TestDiagnose(t *testing.T) {
assert.Equal(t, 1, len(alerts[i].Messages))
assert.Equal(t, "Container test-4-crashlooping is in CrashLoopBackOff: restarted 4 times, last exit due to Error (exit code 1)", alerts[i].Messages[0])
assert.Equal(t, 1, len(alerts[i].Events))
assert.Equal(t, `Event by kubelet: BackOff x8 since 17 Oct 21 14:15 UTC (last seen 3 minutes ago):
assert.Equal(t, `Event by kubelet: BackOff x8 since 17 Oct 21 14:15 UTC, 4 minutes ago (last seen 3 minutes ago):
Back-off restarting failed container`, alerts[i].Events[0])
assert.Equal(t, 1, len(alerts[i].LogsByContainerName))
assert.Equal(t, "default/test-4-crashlooping-dbdd84589-8m7kj/test-4-crashlooping/logs", alerts[i].LogsByContainerName["test-4-crashlooping"])
Expand All @@ -120,7 +120,7 @@ func TestDiagnose(t *testing.T) {
assert.Equal(t, 1, len(alerts[i].Messages))
assert.Equal(t, "Container test-5-completed is in CrashLoopBackOff: restarted 4 times, last exit due to Completed (exit code 0)", alerts[i].Messages[0])
assert.Equal(t, 1, len(alerts[i].Events))
assert.Equal(t, `Event by kubelet: BackOff x8 since 17 Oct 21 14:15 UTC (last seen 2 minutes ago):
assert.Equal(t, `Event by kubelet: BackOff x8 since 17 Oct 21 14:15 UTC, 4 minutes ago (last seen 2 minutes ago):
Back-off restarting failed container`, alerts[i].Events[0])
assert.Equal(t, 1, len(alerts[i].LogsByContainerName))
assert.Equal(t, "default/test-5-completed-757685986-qxbqp/test-5-completed/logs", alerts[i].LogsByContainerName["test-5-completed"])
Expand All @@ -133,7 +133,7 @@ func TestDiagnose(t *testing.T) {
assert.Equal(t, 1, len(alerts[i].Messages))
assert.Equal(t, "Container test-6-crashlooping-init-container (init) is in CrashLoopBackOff: restarted 4 times, last exit due to Error (exit code 1)", alerts[i].Messages[0])
assert.Equal(t, 1, len(alerts[i].Events))
assert.Equal(t, `Event by kubelet: BackOff x7 since 17 Oct 21 14:15 UTC (last seen 3 minutes ago):
assert.Equal(t, `Event by kubelet: BackOff x7 since 17 Oct 21 14:15 UTC, 4 minutes ago (last seen 3 minutes ago):
Back-off restarting failed container`, alerts[i].Events[0])
assert.Equal(t, 1, len(alerts[i].LogsByContainerName))
assert.Equal(t, "default/test-6-crashlooping-init-644545f5b7-l468n/test-6-crashlooping-init-container/logs", alerts[i].LogsByContainerName["test-6-crashlooping-init-container"])
Expand Down
39 changes: 20 additions & 19 deletions diag/events_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"github.com/reallyliri/kubescout/internal"
"github.com/reallyliri/kubescout/kubeclient"
log "github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"strings"
"testing"
Expand All @@ -29,7 +30,7 @@ func TestEventState_StandardEvents(t *testing.T) {
require.NotEmpty(t, state.name.name)
messages := strings.Split(state.cleanMessage(), "\n")
require.Equal(t, 4, len(messages))
require.Equal(t, "Event by kubelet: Unhealthy x2 since 12 Oct 21 13:54 UTC (last seen 26 seconds ago):", messages[0])
require.Equal(t, "Event by kubelet: Unhealthy x2 since 12 Oct 21 13:54 UTC, 41 seconds ago (last seen 26 seconds ago):", messages[0])
require.Equal(t, "\tLiveness probe failed: % Total % Received % Xferd Average Speed Time Time Time Current", messages[1])
require.Equal(t, "\tDload Upload Total Spent Left Speed", messages[2])
require.Equal(t, "\t0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0curl: (7) Failed to connect to localhost port 8095: Connection refused", messages[3])
Expand Down Expand Up @@ -57,39 +58,39 @@ func TestEventState_MountFailedEvents(t *testing.T) {
require.False(t, state.isHealthy())
require.NotEmpty(t, state.name.name)
messages := strings.Split(state.cleanMessage(), "\n")
require.Equal(t, 2, len(messages))
require.Equal(t, "Event by kubelet: Failed x351 since 12 Oct 21 12:00 UTC (last seen 9 minutes ago):", messages[0])
require.Equal(t, "\tError: ImagePullBackOff", messages[1])
assert.Equal(t, 2, len(messages))
assert.Equal(t, "Event by kubelet: Failed x351 since 12 Oct 21 12:00 UTC, 1 hour ago (last seen 9 minutes ago):", messages[0])
assert.Equal(t, "\tError: ImagePullBackOff", messages[1])

state, err = testContext(now).eventState(&events[10])
require.Nil(t, err)
log.Debugf("%v) %v", 10, state)
require.False(t, state.isHealthy())
require.NotEmpty(t, state.name.name)
messages = strings.Split(state.cleanMessage(), "\n")
require.Equal(t, 2, len(messages))
require.Equal(t, "Event by default-scheduler: FailedScheduling x476 since 12 Oct 21 12:01 UTC (last seen 4 minutes ago):", messages[0])
require.Equal(t, "\t0/7 nodes are available: 7 Insufficient memory.", messages[1])
assert.Equal(t, 2, len(messages))
assert.Equal(t, "Event by default-scheduler: FailedScheduling x476 since 12 Oct 21 12:01 UTC, 1 hour ago (last seen 4 minutes ago):", messages[0])
assert.Equal(t, "\t0/7 nodes are available: 7 Insufficient memory.", messages[1])

state, err = testContext(now).eventState(&events[11])
require.Nil(t, err)
log.Debugf("%v) %v", 11, state)
require.False(t, state.isHealthy())
require.NotEmpty(t, state.name.name)
messages = strings.Split(state.cleanMessage(), "\n")
require.Equal(t, 2, len(messages))
require.Equal(t, "Event by kubelet: FailedMount x10 since 12 Oct 21 12:02 UTC (last seen 3 minutes ago):", messages[0])
require.Equal(t, "\tUnable to attach or mount volumes: unmounted volumes=[nginx-pvc], unattached volumes=[default-token-6xwwv nginx-pvc]: timed out waiting for the condition", messages[1])
assert.Equal(t, 2, len(messages))
assert.Equal(t, "Event by kubelet: FailedMount x10 since 12 Oct 21 12:02 UTC, 1 hour ago (last seen 3 minutes ago):", messages[0])
assert.Equal(t, "\tUnable to attach or mount volumes: unmounted volumes=[nginx-pvc], unattached volumes=[default-token-6xwwv nginx-pvc]: timed out waiting for the condition", messages[1])

state, err = testContext(now).eventState(&events[12])
require.Nil(t, err)
log.Debugf("%v) %v", 12, state)
require.False(t, state.isHealthy())
require.NotEmpty(t, state.name.name)
messages = strings.Split(state.cleanMessage(), "\n")
require.Equal(t, 2, len(messages))
require.Equal(t, "Event by kubelet: FailedMount x28 since 12 Oct 21 12:05 UTC (last seen 5 minutes ago):", messages[0])
require.Equal(t, "\tUnable to attach or mount volumes: unmounted volumes=[nginx-pvc], unattached volumes=[nginx-pvc default-token-6xwwv]: timed out waiting for the condition", messages[1])
assert.Equal(t, 2, len(messages))
assert.Equal(t, "Event by kubelet: FailedMount x28 since 12 Oct 21 12:05 UTC, 1 hour ago (last seen 5 minutes ago):", messages[0])
assert.Equal(t, "\tUnable to attach or mount volumes: unmounted volumes=[nginx-pvc], unattached volumes=[nginx-pvc default-token-6xwwv]: timed out waiting for the condition", messages[1])
}

func TestEventState_NodeProblemDetector(t *testing.T) {
Expand All @@ -107,18 +108,18 @@ func TestEventState_NodeProblemDetector(t *testing.T) {
require.False(t, state.isHealthy())
require.NotEmpty(t, state.name.name)
messages := strings.Split(state.cleanMessage(), "\n")
require.Equal(t, 1, len(messages))
require.Equal(t, "Event by sysctl-monitor: NodeSysctlChange x29 since 07 Oct 21 05:24 UTC (last seen 1 hour ago)", messages[0])
assert.Equal(t, 1, len(messages))
assert.Equal(t, "Event by sysctl-monitor: NodeSysctlChange x29 since 07 Oct 21 05:24 UTC, 1 week ago (last seen 1 hour ago)", messages[0])

state, err = testContext(now).eventState(&events[1])
require.Nil(t, err)
log.Debugf("%v) %v", 1, state)
require.False(t, state.isHealthy())
require.NotEmpty(t, state.name.name)
messages = strings.Split(state.cleanMessage(), "\n")
require.Equal(t, 2, len(messages))
require.Equal(t, "Event by kernel-monitor: KernelOops since 14 Oct 21 06:10 UTC (last seen 19 minutes ago):", messages[0])
require.Equal(t, "\tkernel: BUG: unable to handle kernel NULL pointer dereference at TESTING", messages[1])
assert.Equal(t, 2, len(messages))
assert.Equal(t, "Event by kernel-monitor: KernelOops since 14 Oct 21 06:10 UTC, 19 minutes ago:", messages[0])
assert.Equal(t, "\tkernel: BUG: unable to handle kernel NULL pointer dereference at TESTING", messages[1])
}

func TestEventState_FailedJobs(t *testing.T) {
Expand All @@ -141,7 +142,7 @@ func TestEventState_FailedJobs(t *testing.T) {
require.NotEmpty(t, state.name.name)
messages := strings.Split(state.cleanMessage(), "\n")
require.Equal(t, 2, len(messages))
require.Equal(t, "Event by job-controller: BackoffLimitExceeded since 21 Oct 21 10:06 UTC (last seen 53 minutes ago):", messages[0])
require.Equal(t, "Event by job-controller: BackoffLimitExceeded since 21 Oct 21 10:06 UTC, 53 minutes ago:", messages[0])
require.Equal(t, "\tJob has reached the specified backoff limit", messages[1])
}

Expand Down
Loading

0 comments on commit ead8d2d

Please sign in to comment.