Skip to content

Commit 71a94c0

Browse files
authored
Patch kube-vip to retry exponentially on transient 403/401 errors during service watcher initialization. (#5256)
Signed-off-by: Rahul <rahulgab@amazon.com>
1 parent 7de4339 commit 71a94c0

File tree

3 files changed

+90
-3
lines changed

3 files changed

+90
-3
lines changed
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
5d0779b8bfffe0a71bbac9b9e4f2a06a7fbce225b7c8bb06d85292f5f1fcbfe5 _output/bin/kube-vip/linux-amd64/kube-vip
2-
1d49ff52c9589cce99673743a121ed99d9040847c6490426bc16899e5af30892 _output/bin/kube-vip/linux-arm64/kube-vip
1+
5f784e3a1abd274437359e49fb858df1cb78a90b4ca14f5e78b9f5f92a7784ab _output/bin/kube-vip/linux-amd64/kube-vip
2+
1738d010ac96fdca0d0610151eef84b63965595c2a57cc5fcf7fad9782d2660b _output/bin/kube-vip/linux-arm64/kube-vip

projects/kube-vip/kube-vip/patches/0001-use-hostname-instead-of-kubernetes-to-contact-apiser.patch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
From 40bf11fd0b44f39ad39de8f65eb664a2172ed983 Mon Sep 17 00:00:00 2001
22
From: Abhinav Pandey <abhinavmpandey08@gmail.com>
33
Date: Wed, 2 Mar 2022 16:40:11 -0800
4-
Subject: [PATCH] use hostname instead of "kubernetes" to contact apiserver
4+
Subject: [PATCH 1/2] use hostname instead of "kubernetes" to contact apiserver
55

66
---
77
pkg/kubevip/config_generator.go | 7 -------
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
From a39f3f7e30dc36812881e8f43d50b73abb5d2aba Mon Sep 17 00:00:00 2001
2+
From: Rahul Ganesh <rahulgab@amazon.com>
3+
Date: Fri, 20 Mar 2026 11:51:16 -0700
4+
Subject: [PATCH 2/2] retry on 403/401 in ServicesWatcher with exponential backoff
5+
6+
Retry WatchFn with exponential backoff when the service watcher starts against
7+
transient 403 and 401 errors. On joining CP nodes with K8s 1.34+, the local
8+
etcd may still be a learner when kube-vip starts, causing RBAC data
9+
to be unavailable through the learner until it graduates.
10+
11+
Signed-off-by: Rahul Ganesh <rahulgab@amazon.com>
12+
---
13+
pkg/services/watch_services.go | 39 +++++++++++++++++++++++++++++++++-
14+
1 file changed, 38 insertions(+), 1 deletion(-)
15+
16+
diff --git a/pkg/services/watch_services.go b/pkg/services/watch_services.go
17+
index 8c27ec6..58631ba 100644
18+
--- a/pkg/services/watch_services.go
19+
+++ b/pkg/services/watch_services.go
20+
@@ -3,6 +3,7 @@ package services
21+
import (
22+
"context"
23+
"fmt"
24+
+ "time"
25+
26+
log "log/slog"
27+
28+
@@ -14,11 +15,45 @@ import (
29+
v1 "k8s.io/api/core/v1"
30+
apierrors "k8s.io/apimachinery/pkg/api/errors"
31+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
32+
+ "k8s.io/apimachinery/pkg/util/wait"
33+
"k8s.io/apimachinery/pkg/watch"
34+
"k8s.io/client-go/tools/cache"
35+
watchtools "k8s.io/client-go/tools/watch"
36+
)
37+
38+
+// watchWithAuthRetry retries watchFn with exponential backoff on transient 403 Forbidden
39+
+// and 401 Unauthorized errors. On joining control plane nodes with K8s 1.34+, the local
40+
+// etcd may still be a learner when kube-vip starts, causing RBAC data to be unavailable.
41+
+// These auth errors resolve once etcd is promoted to a full member (typically within seconds).
42+
+// Non-auth errors are returned immediately. Context cancellation stops the retry loop.
43+
+func watchWithAuthRetry(ctx context.Context, watchFn func(context.Context) (watch.Interface, error)) (watch.Interface, error) {
44+
+ var w watch.Interface
45+
+ var lastErr error
46+
+ err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
47+
+ Duration: 2 * time.Second,
48+
+ Factor: 2.0,
49+
+ Jitter: 0.1,
50+
+ Steps: 10,
51+
+ Cap: 30 * time.Second,
52+
+ }, func(ctx context.Context) (bool, error) {
53+
+ var watchErr error
54+
+ w, watchErr = watchFn(ctx)
55+
+ if watchErr == nil {
56+
+ return true, nil
57+
+ }
58+
+ if !apierrors.IsForbidden(watchErr) && !apierrors.IsUnauthorized(watchErr) {
59+
+ return false, watchErr
60+
+ }
61+
+ lastErr = watchErr
62+
+ log.Warn("(svcs) services watch auth error, retrying", "err", watchErr)
63+
+ return false, nil
64+
+ })
65+
+ if err != nil {
66+
+ return nil, fmt.Errorf("(svcs) services watch failed after retries: %w (last: %v)", err, lastErr)
67+
+ }
68+
+ return w, nil
69+
+}
70+
+
71+
// This function handles the watching of a services endpoints and updates a load balancers endpoint configurations accordingly
72+
func (p *Processor) ServicesWatcher(ctx context.Context, serviceFunc func(*servicecontext.Context, *v1.Service) error) error {
73+
// first start port mirroring if enabled
74+
@@ -44,7 +79,9 @@ func (p *Processor) ServicesWatcher(ctx context.Context, serviceFunc func(*servi
75+
// Use a restartable watcher, as this should help in the event of etcd or timeout issues
76+
rw, err := watchtools.NewRetryWatcherWithContext(ctx, "1", &cache.ListWatch{
77+
WatchFunc: func(_ metav1.ListOptions) (watch.Interface, error) {
78+
- return p.rwClientSet.CoreV1().Services(p.config.ServiceNamespace).Watch(ctx, metav1.ListOptions{})
79+
+ return watchWithAuthRetry(ctx, func(ctx context.Context) (watch.Interface, error) {
80+
+ return p.rwClientSet.CoreV1().Services(p.config.ServiceNamespace).Watch(ctx, metav1.ListOptions{})
81+
+ })
82+
},
83+
})
84+
if err != nil {
85+
--
86+
2.46.0
87+

0 commit comments

Comments
 (0)