Skip to content

Commit a6b6730

Browse files
rajaSahil and kron4eg authored
Fix reset failing to delete PVs blocked by PDBs and respawned pods (#4100)
* fix volume destroy for kubeone reset
  Signed-off-by: rajaSahil <sahilraja242@gmail.com>
* Make setup-go use the latest and greatest Go version
  Signed-off-by: Artiom Diomin <artiom@kubermatic.com>
* Don't use cache in actions/setup-go
  Signed-off-by: Artiom Diomin <artiom@kubermatic.com>
* use go-version: 1.26 in GH action setup-go
  Signed-off-by: Artiom Diomin <artiom@kubermatic.com>

---------

Signed-off-by: rajaSahil <sahilraja242@gmail.com>
Signed-off-by: Artiom Diomin <artiom@kubermatic.com>
Co-authored-by: Artiom Diomin <artiom@kubermatic.com>
1 parent 31fb546 commit a6b6730

7 files changed

Lines changed: 51 additions & 18 deletions

File tree

.github/workflows/golangci-lint.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ jobs:
1616
- uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
1717
with:
1818
go-version: 1.26
19-
cache: true
2019

2120
- name: golangci-lint
2221
uses: golangci/golangci-lint-action@1e7e51e771db61008b38414a730f564565cf7c20 # v9.2.0

.github/workflows/release.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ jobs:
2727
- uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
2828
with:
2929
go-version: 1.26
30-
cache: true
3130

3231
- name: fetch default stable for Kubernetes
3332
run: |

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module k8c.io/kubeone
22

3-
go 1.26.1
3+
go 1.26
44

55
require (
66
dario.cat/mergo v1.0.1

pkg/clientutil/service.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ func CleanupLBs(ctx context.Context, logger logrus.FieldLogger, c client.Client)
3636
// Block service creation so gateway/other controllers can't recreate LB services
3737
// while we're deleting them.
3838
logger.Infoln("Creating ValidatingWebhookConfiguration to disable future Service creation...")
39-
if err := creationPreventingWebhook(ctx, c, "", LBResources); err != nil {
39+
if err := creationPreventingWebhook(ctx, c, "", LBResources, nil); err != nil {
4040
return fail.KubeClient(err, "disabling future Service creation")
4141
}
4242

pkg/clientutil/volumes.go

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"k8c.io/kubeone/pkg/fail"
2828

2929
corev1 "k8s.io/api/core/v1"
30+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031
"k8s.io/apimachinery/pkg/util/wait"
3132
"k8s.io/client-go/kubernetes"
3233
"k8s.io/client-go/rest"
@@ -45,13 +46,25 @@ const (
4546

4647
var VolumeResources = []string{"persistentvolumes", "persistentvolumeclaims"}
4748

49+
// PodResources is the list of resources blocked by the Pod creation-preventing webhook.
50+
var PodResources = []string{"pods"}
51+
4852
func CleanupUnretainedVolumes(ctx context.Context, logger logrus.FieldLogger, c client.Client, restConfig *rest.Config) error {
4953
// We disable the PV & PVC creation so nothing creates new PV's while we delete them
5054
logger.Infoln("Creating ValidatingWebhookConfiguration to disable future PV & PVC creation...")
5155
if err := disablePVCreation(ctx, c); err != nil {
5256
return fail.KubeClient(err, "disabling future PV & PVC creation.")
5357
}
5458

59+
// We also disable Pod creation so workload controllers (Deployments, StatefulSets, ...) can't
60+
// recreate the Pods we delete below. A recreated Pod would re-mount the PVC and keep the
61+
// kubernetes.io/pvc-protection finalizer, leaving the PVC stuck in Terminating. The webhook
62+
// excludes kube-system so the CSI controllers keep running to actually delete the cloud volumes.
63+
logger.Infoln("Creating ValidatingWebhookConfiguration to disable future Pod creation...")
64+
if err := disablePodCreation(ctx, c); err != nil {
65+
return fail.KubeClient(err, "disabling future Pod creation.")
66+
}
67+
5568
pvcList, pvList, err := getDynamicallyProvisionedUnretainedPvs(ctx, c)
5669
if err != nil {
5770
return err
@@ -91,7 +104,24 @@ func CleanupUnretainedVolumes(ctx context.Context, logger logrus.FieldLogger, c
91104

92105
func disablePVCreation(ctx context.Context, c client.Client) error {
93106
// Prevent re-creation of PVs and PVCs by using an intentionally defunct admissionWebhook
94-
return creationPreventingWebhook(ctx, c, "", VolumeResources)
107+
return creationPreventingWebhook(ctx, c, "", VolumeResources, nil)
108+
}
109+
110+
func disablePodCreation(ctx context.Context, c client.Client) error {
111+
// Prevent re-creation of Pods by using an intentionally defunct admissionWebhook. kube-system is
112+
// excluded so the CSI controllers (and other system components) keep running to delete the cloud
113+
// volumes backing the PVCs we remove.
114+
namespaceSelector := &metav1.LabelSelector{
115+
MatchExpressions: []metav1.LabelSelectorRequirement{
116+
{
117+
Key: corev1.LabelMetadataName,
118+
Operator: metav1.LabelSelectorOpNotIn,
119+
Values: []string{metav1.NamespaceSystem},
120+
},
121+
},
122+
}
123+
124+
return creationPreventingWebhook(ctx, c, "", PodResources, namespaceSelector)
95125
}
96126

97127
func cleanupPVCUsingPods(ctx context.Context, c client.Client, log logrus.FieldLogger, kubeClient *kubernetes.Clientset) error {
@@ -100,11 +130,11 @@ func cleanupPVCUsingPods(ctx context.Context, c client.Client, log logrus.FieldL
100130
return fail.KubeClient(err, "listing Pods from user cluster.")
101131
}
102132

103-
var pvUsingPods []*corev1.Pod
133+
var pvUsingPods []corev1.Pod
104134
for idx := range podList.Items {
105135
pod := &podList.Items[idx]
106136
if podUsesPV(pod) {
107-
pvUsingPods = append(pvUsingPods, pod)
137+
pvUsingPods = append(pvUsingPods, *pod)
108138
}
109139
}
110140

@@ -119,6 +149,10 @@ func cleanupPVCUsingPods(ctx context.Context, c client.Client, log logrus.FieldL
119149
// ReplicaSet)
120150
Force: true,
121151
DeleteEmptyDirData: true,
152+
// DisableEviction makes drain delete pods directly instead of using the
153+
// eviction API. The cluster is being torn down, so PodDisruptionBudgets
154+
// must not be allowed to block (and indefinitely fail) the deletion.
155+
DisableEviction: true,
122156
GracePeriodSeconds: -1,
123157
Out: os.Stdout,
124158
ErrOut: os.Stdout,
@@ -135,16 +169,9 @@ func cleanupPVCUsingPods(ctx context.Context, c client.Client, log logrus.FieldL
135169
log.Infof("pod %s/%s is %s", pod.GetNamespace(), pod.GetName(), evicted)
136170
},
137171
}
138-
evictionGroupVersion, err := drain.CheckEvictionSupport(kubeClient)
139-
if err != nil {
140-
return err
141-
}
142172

143-
for _, pod := range pvUsingPods {
144-
err := helper.EvictPod(*pod, evictionGroupVersion)
145-
if err != nil {
146-
return fail.KubeClient(err, "deleting the pod.")
147-
}
173+
if err := helper.DeleteOrEvictPods(pvUsingPods); err != nil {
174+
return fail.KubeClient(err, "deleting the pods using PVs.")
148175
}
149176

150177
return nil

pkg/clientutil/webhook.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,15 @@ import (
2323
"k8c.io/kubeone/pkg/fail"
2424

2525
admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2627
"k8s.io/apimachinery/pkg/types"
2728
"sigs.k8s.io/controller-runtime/pkg/client"
2829
)
2930

3031
// creationPreventingWebhook returns a ValidatingWebhookConfiguration that is intentionally defunct
31-
// and will prevent all creation requests from succeeding.
32-
func creationPreventingWebhook(ctx context.Context, c client.Client, apiGroup string, resources []string) error {
32+
// and will prevent all creation requests from succeeding. An optional namespaceSelector limits which
33+
// namespaces the webhook applies to (e.g. to keep CSI controllers in kube-system running during cleanup).
34+
func creationPreventingWebhook(ctx context.Context, c client.Client, apiGroup string, resources []string, namespaceSelector *metav1.LabelSelector) error {
3335
failurePolicy := admissionregistrationv1.Fail
3436
sideEffects := admissionregistrationv1.SideEffectClassNone
3537
vwc := admissionregistrationv1.ValidatingWebhookConfiguration{}
@@ -62,6 +64,7 @@ func creationPreventingWebhook(ctx context.Context, c client.Client, apiGroup st
6264
},
6365
},
6466
}
67+
vwc.Webhooks[0].NamespaceSelector = namespaceSelector
6568
vwc.Webhooks[0].FailurePolicy = &failurePolicy
6669
vwc.Webhooks[0].SideEffects = &sideEffects
6770
vwc.Webhooks[0].AdmissionReviewVersions = []string{"v1"}

pkg/tasks/reset.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ func RemoveVolumes(s *state.State) error {
7373
"kubernetes-cluster-cleanup-"+strings.Join(clientutil.VolumeResources, "-")); err != nil {
7474
s.Logger.Warn("Unable to delete ValidatingWebhookConfiguration.")
7575
}
76+
s.Logger.Infoln("Deleting ValidatingWebhookConfiguration to enable future Pod creation...")
77+
if err := clientutil.DeletePreventingWebhook(s.Context, s.DynamicClient,
78+
"kubernetes-cluster-cleanup-"+strings.Join(clientutil.PodResources, "-")); err != nil {
79+
s.Logger.Warn("Unable to delete ValidatingWebhookConfiguration.")
80+
}
7681

7782
return lastErr
7883
}

0 commit comments

Comments
 (0)