Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 629fe20

Browse filesBrowse files
committed
devicemanager: Allow failing to allocate devices to be retried
This is a very coarse hack in the process of resolving #128043 I'm not happy with the retriable error concept, or necessarily sure about the exact paths that we want to retry. Very much open to changes here.
1 parent a0905af commit 629fe20
Copy full SHA for 629fe20

File tree

Expand file treeCollapse file tree

13 files changed

+125
-58
lines changed
Filter options
Expand file treeCollapse file tree

13 files changed

+125
-58
lines changed

‎pkg/kubelet/cm/admission/errors.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/admission/errors.go
+50-3Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,56 @@ func (e *unexpectedAdmissionError) Type() string {
4444
return ErrorReasonUnexpected
4545
}
4646

47-
func GetPodAdmitResult(err error) lifecycle.PodAdmitResult {
47+
type retryableError interface {
48+
Error() string
49+
Type() string
50+
IsRetryable() bool
51+
}
52+
53+
type retryableAdmissionError struct{ Err error }
54+
55+
var _ Error = (*retryableAdmissionError)(nil)
56+
57+
func (e *retryableAdmissionError) Error() string {
58+
return e.Err.Error()
59+
}
60+
61+
func (e *retryableAdmissionError) Type() string {
62+
return ErrorReasonUnexpected
63+
}
64+
65+
func (e *retryableAdmissionError) IsRetryable() bool {
66+
return true
67+
}
68+
69+
// NewRetryableAdmissionError indicates that an admission error may
70+
// be retried.
71+
// At the time of implementation, a single retry will be performed with a 5s delay.
72+
//
73+
// note: Initially this is only used by the devicemanager to account
74+
//
75+
// for node restarts and races with plugins coming up.
76+
// It is an unclean hack to reduce the scope of the topologymanager refactor.
77+
func NewRetryableAdmissionError(err error) Error {
78+
if err == nil {
79+
return nil
80+
}
81+
return &retryableAdmissionError{Err: err}
82+
}
83+
84+
func GetPodAdmitResult(err error) (lifecycle.PodAdmitResult, error) {
4885
if err == nil {
49-
return lifecycle.PodAdmitResult{Admit: true}
86+
return lifecycle.PodAdmitResult{Admit: true}, nil
87+
}
88+
89+
var retryableErr retryableError
90+
if errors.As(err, &retryableErr) && retryableErr.IsRetryable() {
91+
admissionErr := &unexpectedAdmissionError{err}
92+
return lifecycle.PodAdmitResult{
93+
Message: admissionErr.Error(),
94+
Reason: admissionErr.Type(),
95+
Admit: false,
96+
}, retryableErr
5097
}
5198

5299
var admissionErr Error
@@ -58,5 +105,5 @@ func GetPodAdmitResult(err error) lifecycle.PodAdmitResult {
58105
Message: admissionErr.Error(),
59106
Reason: admissionErr.Type(),
60107
Admit: false,
61-
}
108+
}, nil
62109
}

‎pkg/kubelet/cm/admission/errors_test.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/admission/errors_test.go
+15-1Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,26 +38,40 @@ func TestAdmissionErrors(t *testing.T) {
3838
testCases := []struct {
3939
Error error
4040
expectedAdmissionError bool
41+
expectedAdmissionFail bool
4142
}{
4243
{
4344
nil,
4445
false,
46+
false,
4547
},
4648
{
4749
errors.New("Not an AdmissionError error"),
4850
false,
51+
false,
4952
},
5053
{
5154
&TestAdmissionError{
5255
"Is an AdmissionError error",
5356
"TestAdmissionError",
5457
},
5558
true,
59+
false,
60+
},
61+
{
62+
NewRetryableAdmissionError(errors.New("Retryable error")),
63+
false,
64+
true,
5665
},
5766
}
5867

5968
for _, tc := range testCases {
60-
h := GetPodAdmitResult(tc.Error)
69+
h, err := GetPodAdmitResult(tc.Error)
70+
if tc.expectedAdmissionFail {
71+
if err == nil {
72+
t.Errorf("expected GetPodAdmitResult to return an error")
73+
}
74+
}
6175
if tc.Error == nil {
6276
if !h.Admit {
6377
t.Errorf("expected PodAdmitResult.Admit = true")

‎pkg/kubelet/cm/devicemanager/manager.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/devicemanager/manager.go
+6-3Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"k8s.io/kubernetes/pkg/features"
4242
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
4343
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
44+
"k8s.io/kubernetes/pkg/kubelet/cm/admission"
4445
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
4546
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
4647
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
@@ -611,12 +612,14 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
611612
// Note: we need to check the device health and registration status *before* we check how many devices are needed, doing otherwise caused issue #109595
612613
// Note: if the scheduler is bypassed, we fall back in scenario 1, so we still need these checks.
613614
if !hasRegistered {
614-
return nil, fmt.Errorf("cannot allocate unregistered device %s", resource)
615+
return nil, admission.NewRetryableAdmissionError(fmt.Errorf("cannot allocate unregistered device %s", resource))
615616
}
616617

617-
// Check if registered resource has healthy devices
618+
// Check if registered resource has healthy devices - this is often empty after
619+
// a node reboot before a provider has fully initialized devices but is already
620+
// running.
618621
if healthyDevices.Len() == 0 {
619-
return nil, fmt.Errorf("no healthy devices present; cannot allocate unhealthy devices %s", resource)
622+
return nil, admission.NewRetryableAdmissionError(fmt.Errorf("no healthy devices present; cannot allocate unhealthy devices %s", resource))
620623
}
621624

622625
// Check if all the previously allocated devices are healthy

‎pkg/kubelet/cm/devicemanager/manager_test.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/devicemanager/manager_test.go
+8-9Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,15 +1176,14 @@ func TestPodContainerDeviceToAllocate(t *testing.T) {
11761176
}
11771177

11781178
for _, testCase := range testCases {
1179-
allocDevices, err := testManager.devicesToAllocate(testCase.podUID, testCase.contName, testCase.resource, testCase.required, testCase.reusableDevices)
1180-
if !reflect.DeepEqual(err, testCase.expErr) {
1181-
t.Errorf("devicePluginManager error (%v). expected error: %v but got: %v",
1182-
testCase.description, testCase.expErr, err)
1183-
}
1184-
if !reflect.DeepEqual(allocDevices, testCase.expectedAllocatedDevices) {
1185-
t.Errorf("devicePluginManager error (%v). expected error: %v but got: %v",
1186-
testCase.description, testCase.expectedAllocatedDevices, allocDevices)
1187-
}
1179+
t.Run(testCase.description, func(t *testing.T) {
1180+
allocDevices, err := testManager.devicesToAllocate(testCase.podUID, testCase.contName, testCase.resource, testCase.required, testCase.reusableDevices)
1181+
require.EqualError(t, err, testCase.expErr.Error())
1182+
if !reflect.DeepEqual(allocDevices, testCase.expectedAllocatedDevices) {
1183+
t.Errorf("devicePluginManager error (%v). expected error: %v but got: %v",
1184+
testCase.description, testCase.expectedAllocatedDevices, allocDevices)
1185+
}
1186+
})
11881187
}
11891188

11901189
}

‎pkg/kubelet/cm/topologymanager/fake_topology_manager.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/topologymanager/fake_topology_manager.go
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,5 +79,5 @@ func (m *fakeManager) RemoveContainer(containerID string) error {
7979

8080
func (m *fakeManager) Admit(attrs *lifecycle.PodAdmitAttributes) (lifecycle.PodAdmitResult, error) {
8181
klog.InfoS("Topology Admit Handler")
82-
return admission.GetPodAdmitResult(nil), nil
82+
return admission.GetPodAdmitResult(nil)
8383
}

‎pkg/kubelet/cm/topologymanager/fake_topology_manager_test.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/topologymanager/fake_topology_manager_test.go
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import (
2020
"reflect"
2121
"testing"
2222

23-
"k8s.io/api/core/v1"
23+
v1 "k8s.io/api/core/v1"
2424
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
2525
)
2626

@@ -116,7 +116,7 @@ func TestFakeAdmit(t *testing.T) {
116116
pod := v1.Pod{}
117117
pod.Status.QOSClass = tc.qosClass
118118
podAttr.Pod = &pod
119-
actual := fm.Admit(&podAttr)
119+
actual, _ := fm.Admit(&podAttr)
120120
if reflect.DeepEqual(actual, tc.result) {
121121
t.Errorf("Error occurred, expected Admit in result to be %v got %v", tc.result, actual.Admit)
122122
}

‎pkg/kubelet/cm/topologymanager/scope.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/topologymanager/scope.go
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,10 +139,10 @@ func (s *scope) admitPolicyNone(pod *v1.Pod) (lifecycle.PodAdmitResult, error) {
139139
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
140140
err := s.allocateAlignedResources(pod, &container)
141141
if err != nil {
142-
return admission.GetPodAdmitResult(err), nil
142+
return admission.GetPodAdmitResult(err)
143143
}
144144
}
145-
return admission.GetPodAdmitResult(nil), nil
145+
return admission.GetPodAdmitResult(nil)
146146
}
147147

148148
// It would be better to implement this function in topologymanager instead of scope

‎pkg/kubelet/cm/topologymanager/scope_container.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/topologymanager/scope_container.go
+3-3Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,23 +54,23 @@ func (s *containerScope) Admit(pod *v1.Pod) (lifecycle.PodAdmitResult, error) {
5454
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
5555
}
5656
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
57-
return admission.GetPodAdmitResult(&TopologyAffinityError{}), nil
57+
return admission.GetPodAdmitResult(&TopologyAffinityError{})
5858
}
5959
klog.InfoS("Topology Affinity", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
6060
s.setTopologyHints(string(pod.UID), container.Name, bestHint)
6161

6262
err := s.allocateAlignedResources(pod, &container)
6363
if err != nil {
6464
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
65-
return admission.GetPodAdmitResult(err), nil
65+
return admission.GetPodAdmitResult(err)
6666
}
6767

6868
if IsAlignmentGuaranteed(s.policy) {
6969
klog.V(4).InfoS("Resource alignment at container scope guaranteed", "pod", klog.KObj(pod))
7070
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
7171
}
7272
}
73-
return admission.GetPodAdmitResult(nil), nil
73+
return admission.GetPodAdmitResult(nil)
7474
}
7575

7676
func (s *containerScope) accumulateProvidersHints(pod *v1.Pod, container *v1.Container) []map[string][]TopologyHint {

‎pkg/kubelet/cm/topologymanager/scope_pod.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/topologymanager/scope_pod.go
+3-3Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func (s *podScope) Admit(pod *v1.Pod) (lifecycle.PodAdmitResult, error) {
5353
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
5454
}
5555
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
56-
return admission.GetPodAdmitResult(&TopologyAffinityError{}), nil
56+
return admission.GetPodAdmitResult(&TopologyAffinityError{})
5757
}
5858

5959
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
@@ -63,15 +63,15 @@ func (s *podScope) Admit(pod *v1.Pod) (lifecycle.PodAdmitResult, error) {
6363
err := s.allocateAlignedResources(pod, &container)
6464
if err != nil {
6565
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
66-
return admission.GetPodAdmitResult(err), nil
66+
return admission.GetPodAdmitResult(err)
6767
}
6868
}
6969
if IsAlignmentGuaranteed(s.policy) {
7070
// increment only if we know we allocate aligned resources.
7171
klog.V(4).InfoS("Resource alignment at pod scope guaranteed", "pod", klog.KObj(pod))
7272
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
7373
}
74-
return admission.GetPodAdmitResult(nil), nil
74+
return admission.GetPodAdmitResult(nil)
7575
}
7676

7777
func (s *podScope) accumulateProvidersHints(pod *v1.Pod) []map[string][]TopologyHint {

‎pkg/kubelet/cm/topologymanager/topology_manager_test.go

Copy file name to clipboardExpand all lines: pkg/kubelet/cm/topologymanager/topology_manager_test.go
+3-3Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import (
2121
"strings"
2222
"testing"
2323

24-
"k8s.io/api/core/v1"
24+
v1 "k8s.io/api/core/v1"
2525

2626
cadvisorapi "github.com/google/cadvisor/info/v1"
2727

@@ -563,7 +563,7 @@ func TestAdmit(t *testing.T) {
563563
}
564564

565565
// Container scope Admit
566-
ctnActual := ctnScopeManager.Admit(&podAttr)
566+
ctnActual, _ := ctnScopeManager.Admit(&podAttr)
567567
if ctnActual.Admit != tc.expected {
568568
t.Errorf("Error occurred, expected Admit in result to be %v got %v", tc.expected, ctnActual.Admit)
569569
}
@@ -572,7 +572,7 @@ func TestAdmit(t *testing.T) {
572572
}
573573

574574
// Pod scope Admit
575-
podActual := podScopeManager.Admit(&podAttr)
575+
podActual, _ := podScopeManager.Admit(&podAttr)
576576
if podActual.Admit != tc.expected {
577577
t.Errorf("Error occurred, expected Admit in result to be %v got %v", tc.expected, podActual.Admit)
578578
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.