Skip to content

Fix controller-manager failures causing CNS to remain incorrectly healthy #3688

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions cns/service/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -836,7 +836,7 @@

// Check the CNI statefile mount and, if the file is empty,
// stub an empty JSON object.
if err := cnireconciler.WriteObjectToCNIStatefile(); err != nil {

Check failure on line 839 in cns/service/main.go

View workflow job for this annotation

GitHub Actions / Lint (1.22.x, ubuntu-latest)

shadow: declaration of "err" shadows declaration at line 532 (govet)

Check failure on line 839 in cns/service/main.go

View workflow job for this annotation

GitHub Actions / Lint (1.22.x, windows-latest)

shadow: declaration of "err" shadows declaration at line 532 (govet)
logger.Errorf("Failed to write empty object to CNI state: %v", err)
return
}
Expand Down Expand Up @@ -1372,7 +1372,7 @@
}

// InitializeCRDState builds and starts the CRD controllers.
func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cnsconfig *configuration.CNSConfig) error {

Check failure on line 1375 in cns/service/main.go

View workflow job for this annotation

GitHub Actions / Lint (1.22.x, ubuntu-latest)

cyclomatic complexity 42 of func `InitializeCRDState` is high (> 30) (gocyclo)

Check failure on line 1375 in cns/service/main.go

View workflow job for this annotation

GitHub Actions / Lint (1.22.x, windows-latest)

cyclomatic complexity 42 of func `InitializeCRDState` is high (> 30) (gocyclo)
// convert interface type to implementation type
httpRestServiceImplementation, ok := httpRestService.(*restserver.HTTPRestService)
if !ok {
Expand Down Expand Up @@ -1609,20 +1609,20 @@
// Start the Manager which starts the reconcile loop.
// The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the
// Monitor's internal loop.
managerErrCh := make(chan error, 1)
go func() {
logger.Printf("Starting controller-manager.")
for {
if err := manager.Start(ctx); err != nil {
logger.Errorf("Failed to start controller-manager: %v", err)
// retry to start the request controller
// inc the managerStartFailures metric for failure tracking
managerStartFailures.Inc()
} else {
logger.Printf("Stopped controller-manager.")
return
}
time.Sleep(time.Second) // TODO(rbtr): make this exponential backoff
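// Bound the context passed to manager.Start with a timeout.
// NOTE(review): if manager.Start blocks until its context is done (as controller-runtime's
// Manager.Start is documented to), this deadline caps the manager's whole run, not just
// its startup — confirm that is the intent.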
managerStartTimeout := 5 * time.Minute
startManagerCtx, startManagerCancel := context.WithTimeout(ctx, managerStartTimeout)
defer startManagerCancel()
if err := manager.Start(startManagerCtx); err != nil {
logger.Errorf("Failed to start controller-manager: %v", err)
managerErrCh <- err
return
}
logger.Printf("Stopped controller-manager.")
managerErrCh <- nil
}()
logger.Printf("Initialized controller-manager.")
for {
Expand All @@ -1633,11 +1633,14 @@
nncReadyCtx, cancel := context.WithTimeout(ctx, 15*time.Minute) // nolint // it will time out and not leak
if started, err := nncReconciler.Started(nncReadyCtx); !started {
logger.Errorf("NNC reconciler has not started, does the NNC exist? err: %v", err)
nncReconcilerStartFailures.Inc()
continue
}
logger.Printf("NodeNetworkConfig reconciler has started.")
cancel()
err := <-managerErrCh
if err != nil {
return errors.Wrap(err, "controller-manager failed")
}
break
}

Expand Down
Loading