platform/k8s/bases/infrastructure/cluster-policies/kustomization.yaml at main · devantler-tech/platform · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
---
# Kustomization for the cluster-policies base. It bundles the custom policy
# manifests with three upstream kyverno/policies samples; the upstream samples
# are adapted in place by the inline JSON 6902 patches under `patches`.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  # Custom policies (no upstream equivalent)
  - best-practices/add-default-deny.yaml
  - best-practices/add-default-limitrange.yaml
  - best-practices/add-resource-defaults.yaml
  - best-practices/add-security-context.yaml
  - best-practices/auto-vpa.yaml
  - best-practices/disable-default-sa-automount.yaml
  - best-practices/disallow-latest-tag.yaml
  - best-practices/propagate-reloader-to-flagger-primary.yaml
  - best-practices/restrict-tenant-secret-stores.yaml
  - best-practices/validate-host-restrictions.yaml
  - best-practices/validate-pdb-drain-safe.yaml
  - best-practices/validate-pod-security.yaml
  - best-practices/validate-replica-ceiling.yaml
  - best-practices/validate-replica-floor.yaml
  - best-practices/verify-image-signatures.yaml
  - flux/enforce-flux-best-practices.yaml
  - flux/helm-release-enable-tests.yaml
  - flux/helm-release-install-crds.yaml
  - flux/helm-release-remediation-retries.yaml
  # Upstream kyverno/policies (synced via reusable-workflows, patched below)
  - samples/best-practices/add-ns-quota/add-ns-quota.yaml
  - samples/other/create-pod-antiaffinity/create-pod-antiaffinity.yaml
  - samples/other/spread-pods-across-topology/spread-pods-across-topology.yaml
patches:
  # --- add-ns-quota: exclude system/platform namespaces, set generateExisting,
  # quota the requests dimension only, drop the LimitRange rule, and skip
  # namespaces that are being deleted ---
  - target:
      kind: ClusterPolicy
      name: add-ns-quota
    patch: |
      - op: add
        path: /spec/rules/0/exclude
        value:
          any:
            - resources:
                names:
                  - kube-system
                  - kube-public
                  - kube-node-lease
                  - flux-system
                  # Platform observability stack. coroot runs the eBPF
                  # node-agent DaemonSet, which must schedule a pod on EVERY
                  # node -- including autoscaler-provisioned ones -- and carries
                  # a VPA-governed footprint that legitimately exceeds the
                  # generic tenant quota. A hard requests.memory ResourceQuota
                  # here rejects those DaemonSet pods on any new node, which
                  # stalls the coroot-operator and the infrastructure-controllers
                  # Flux Kustomization cluster-wide. Exempt it like the other
                  # platform-owned namespaces; its sizing is VPA-owned, not
                  # quota-bounded. It still gets the default LimitRange from the
                  # add-default-limitrange policy.
                  - observability
                  # Longhorn storage data plane. longhorn-manager creates the
                  # instance-manager pods WITHOUT a memory limit by design: during
                  # a volume rebuild they burst well past the generic LimitRange
                  # default (memory 512Mi), get OOMKilled, and longhorn-manager
                  # then deletes+recreates the IM pod -- which faults EVERY replica
                  # engine on that node (DetachedUnexpectedly), cascading into
                  # CNPG primary failover and Postgres timeline divergence
                  # cluster-wide (observed 2026-06-20). It is excluded from BOTH
                  # the ResourceQuota (here) and the LimitRange
                  # (add-default-limitrange drops the OOM-inducing limit by
                  # excluding it too); this rule must drop it as well -- otherwise
                  # pods that no longer receive LimitRange-supplied requests get
                  # rejected by the requests.memory quota. Longhorn sizing is
                  # operator-/VPA-owned (manager, csi, ui carry VPAs), not
                  # quota-bounded.
                  - longhorn-system
                  # Velero backup data plane. Velero runs a kopia
                  # repository-maintenance Job per backup repo (one per backed-up
                  # namespace, ~15), keeps the last few, and recreates them every
                  # maintenance cycle -- a constant churn of short-lived pods. The
                  # Kubernetes ResourceQuota controller LEAKS requests.memory usage
                  # under that churn (kubernetes#118946): completed pods are dropped
                  # from accounting unreliably, so status.used.requests.memory drifts
                  # upward unbounded -- observed 16300Mi "used" against this 16Gi cap
                  # while only ~430Mi of pods were actually non-terminal. Once the
                  # phantom usage reaches the cap the quota admission webhook rejects
                  # every new maintenance/backup pod ("exceeded quota:
                  # default-resourcequota, requested: requests.memory=128Mi"), which
                  # silently halts repository maintenance and kopia/restic backups --
                  # a data-protection outage with no failing Deployment to alert on.
                  # Velero's real footprint is the node-agent DaemonSet (one per
                  # node) plus these ephemeral Jobs, not steady-state tenant
                  # workloads, so a static per-namespace memory quota is the wrong
                  # tool and actively harmful here. Exempt it like the other
                  # platform-owned namespaces; it still gets the default LimitRange
                  # from add-default-limitrange, so backup pods keep sane defaults.
                  - velero
      - op: add
        path: /spec/rules/0/generate/generateExisting
        value: true
      - op: replace
        path: /spec/rules/0/generate/data/spec/hard
        value:
          # Quota only the REQUESTS dimension. auto-vpa.yaml runs VPA with
          # controlledValues: RequestsAndLimits, so VPA raises container LIMITS
          # dynamically (ratio-preserving). A static limits.cpu/limits.memory
          # quota would then reject the very rollouts VPA triggers once limits
          # rise -- an unrecoverable evict-then-can't-recreate loop -- and
          # monitoring already sat at 96% of limits.cpu=16. requests.* still
          # govern aggregate reserved capacity (each pod's request is bounded by
          # VPA maxAllowed), which is the dimension that actually reserves node
          # resources; limits are burst ceilings VPA now owns per pod.
          requests.cpu: "8"
          requests.memory: 16Gi
      # Remove the upstream sample's LimitRange rule. It is re-homed in the
      # standalone add-default-limitrange policy so the ResourceQuota and the
      # LimitRange never share one policy: Kyverno evaluates EVERY rule of a
      # policy against any trigger matching ANY rule (one shared UpdateRequest),
      # so while the LimitRange rule (which does not exclude observability/velero)
      # lived here next to this ResourceQuota rule (which does), this rule fired
      # against those two namespaces and emitted "generate-resourcequota error:
      # policy does not apply to resource" PolicyError events every reconcile.
      # Removing a generate rule leaves its already-generated LimitRanges in
      # place (stale, not deleted -- verified via the validate-policy webhook),
      # so add-default-limitrange adopts them with no gap.
      - op: remove
        path: /spec/rules/1
      # Skip namespaces that are being deleted. A terminating namespace (e.g.
      # cdi/kubevirt after the prod disable) rejects new ResourceQuota/LimitRange
      # with "forbidden: namespace is being terminated", which Kyverno surfaces
      # as PolicyError Warning events that trip the merge-queue event gate.
      - op: add
        path: /spec/rules/0/preconditions
        value:
          all:
            - key: "{{ request.object.metadata.deletionTimestamp || '' }}"
              operator: Equals
              value: ""
  # --- Anti-affinity: add StatefulSet match, use app.kubernetes.io/name, exclude kube-system ---
  # JSON 6902 patch for the upstream insert-pod-antiaffinity ClusterPolicy. All
  # operations address rule 0 of that policy.
  - target:
      kind: ClusterPolicy
      name: insert-pod-antiaffinity
    patch: |
      # Match Deployments and StatefulSets.
      - op: replace
        path: /spec/rules/0/match/any
        value:
          - resources:
              kinds:
                - Deployment
                - StatefulSet
      # Keep the rule away from kube-system. The filter is nested under `any`
      # because Kyverno deprecates `resources` placed directly under
      # match/exclude; this is also the form the add-ns-quota patch uses.
      - op: add
        path: /spec/rules/0/exclude
        value:
          any:
            - resources:
                namespaces:
                  - kube-system
      # Point the existing precondition at the app.kubernetes.io/name label of
      # the pod template; `|| ''` yields an empty string when the label is absent.
      - op: replace
        path: /spec/rules/0/preconditions/all/0/key
        value: "{{request.object.spec.template.metadata.labels.\"app.kubernetes.io/name\" || ''}}"
      # Select peer pods by the same label: key first, then the value taken
      # from the workload being admitted.
      - op: replace
        path: /spec/rules/0/mutate/patchStrategicMerge/spec/template/spec/+(affinity)/+(podAntiAffinity)/+(preferredDuringSchedulingIgnoredDuringExecution)/0/podAffinityTerm/labelSelector/matchExpressions/0/key
        value: "app.kubernetes.io/name"
      - op: replace
        path: /spec/rules/0/mutate/patchStrategicMerge/spec/template/spec/+(affinity)/+(podAntiAffinity)/+(preferredDuringSchedulingIgnoredDuringExecution)/0/podAffinityTerm/labelSelector/matchExpressions/0/values/0
        value: "{{request.object.spec.template.metadata.labels.\"app.kubernetes.io/name\"}}"
  # --- Topology spread: remove label gate, add StatefulSet, hostname key, DoNotSchedule, exclude kube-system ---
  # JSON 6902 patch for the upstream spread-pods ClusterPolicy. All operations
  # address rule 0 of that policy.
  - target:
      kind: ClusterPolicy
      name: spread-pods
    patch: |
      # Replace the whole match list so only the kinds below are matched.
      - op: replace
        path: /spec/rules/0/match/any
        value:
          - resources:
              kinds:
                - Deployment
                - StatefulSet
      # Keep the rule away from kube-system. The filter is nested under `any`
      # because Kyverno deprecates `resources` placed directly under
      # match/exclude; this is also the form the add-ns-quota patch uses.
      - op: add
        path: /spec/rules/0/exclude
        value:
          any:
            - resources:
                namespaces:
                  - kube-system
      # Only mutate workloads whose pod template carries a non-empty
      # app.kubernetes.io/name label; `|| ''` covers the missing-label case.
      - op: add
        path: /spec/rules/0/preconditions
        value:
          all:
            - key: "{{request.object.spec.template.metadata.labels.\"app.kubernetes.io/name\" || ''}}"
              operator: NotEquals
              value: ""
      # Spread across nodes (hostname topology) and make the constraint hard.
      - op: replace
        path: /spec/rules/0/mutate/patchStrategicMerge/spec/template/spec/+(topologySpreadConstraints)/0/topologyKey
        value: "kubernetes.io/hostname"
      - op: replace
        path: /spec/rules/0/mutate/patchStrategicMerge/spec/template/spec/+(topologySpreadConstraints)/0/whenUnsatisfiable
        value: "DoNotSchedule"
      # Scope skew per ReplicaSet revision so rolling updates don't deadlock under
      # DoNotSchedule (surge pods of the new revision would otherwise count against
      # the old revision's skew). Requires k8s >=1.27 (GA in 1.34). For StatefulSets
      # the pod-template-hash key is absent, so k8s ignores it and falls back to the
      # labelSelector — fine, since StatefulSet updates roll one pod at a time.
      - op: add
        path: /spec/rules/0/mutate/patchStrategicMerge/spec/template/spec/+(topologySpreadConstraints)/0/matchLabelKeys
        value:
          - pod-template-hash
      # Count only pods that share the admitted workload's
      # app.kubernetes.io/name value when computing skew.
      - op: replace
        path: /spec/rules/0/mutate/patchStrategicMerge/spec/template/spec/+(topologySpreadConstraints)/0/labelSelector
        value:
          matchExpressions:
            - key: "app.kubernetes.io/name"
              operator: In
              values:
                - "{{request.object.spec.template.metadata.labels.\"app.kubernetes.io/name\"}}"