Skip to content

Commit 9a140e2

Browse files
committed
collector process removes pkg status when failure occurs
1 parent b1b0e66 commit 9a140e2

2 files changed

Lines changed: 31 additions & 27 deletions

File tree

opamp/observiq/observiq_client.go

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"net/http"
2424
"net/url"
25+
"os"
2526
"slices"
2627
"sync"
2728
"time"
@@ -318,33 +319,43 @@ func (c *Client) onConnectHandler(_ context.Context) {
318319
return
319320
}
320321

321-
// See if we can retrieve the PackageStatuses where the collector package is in an installing state
322322
pkgStatuses, err := c.getVerifiedPackageStatuses()
323323
if err != nil {
324324
c.logger.Error("Problem with PackageStatuses", zap.Error(err))
325325
return
326326
}
327-
328327
collectorPkgStatus := pkgStatuses.Packages[packagestate.CollectorPackageName]
329-
// If we were not installing before the connection, nothing else to do
330-
if collectorPkgStatus.Status != protobufs.PackageStatusEnum_PackageStatusEnum_Installing {
331-
return
332-
}
333328

334-
// If in the middle of an install and we just connected, this is most likely becasue the collector was just spun up fresh by the Updater.
329+
// If in the middle of an install and we just connected, this is most likely because the collector was just spun up fresh by the Updater.
335330
// If the current version matches the server offered version, this implies a good install and so we should set the PackageStatuses and
336331
// send it to the OpAMP Server. If the version does not match, just change the PackageStatues JSON so that the Updater can start rollback.
332+
if collectorPkgStatus.Status == protobufs.PackageStatusEnum_PackageStatusEnum_Installing {
333+
if collectorPkgStatus.ServerOfferedVersion != version.Version() {
334+
errMsg := fmt.Sprintf("Failed because of collector version mismatch: expected %s, actual %s",
335+
collectorPkgStatus.ServerOfferedVersion, version.Version())
336+
c.failPackageInstall(pkgStatuses, errMsg, false)
337337

338-
if collectorPkgStatus.ServerOfferedVersion != version.Version() {
339-
errMsg := fmt.Sprintf("Failed because of collector version mismatch: expected %s, actual %s",
340-
collectorPkgStatus.ServerOfferedVersion, version.Version())
341-
c.failPackageInstall(pkgStatuses, errMsg, false)
338+
return
339+
}
342340

341+
// Installation of new collector was successful!
342+
c.finishPackageInstall(pkgStatuses)
343343
return
344344
}
345345

346-
// Installation of new collector was successful!
347-
c.finishPackageInstall(pkgStatuses)
346+
// If the package status is "failed", this most likely means an update was attempted and the new binary failed.
347+
// The Updater has removed the new binary, rolled back the old binary, and re-started it. The current process
348+
// is the old binary that was restored. At this point the OpAMP client has already initialized itself with the
349+
// status contained in the package state file as part of the client connection process. We can safely delete
350+
// the package state file, however we cannot update the stored state on the client (i.e. c.opampClient.SetPackageStatuses(pkgStatuses)).
351+
// Doing so would wipe the state before the client can report the failure. Deleting just the file though is safe. After the initial
352+
// state is reported as part of the connection process, future queries of this file for package state will return a default "installed" status.
353+
// This is good because after reporting the initial error, we don't want to persist an upgrade failure.
354+
if collectorPkgStatus.Status == protobufs.PackageStatusEnum_PackageStatusEnum_InstallFailed {
355+
if err := os.Remove(packagestate.DefaultFileName); err != nil {
356+
c.logger.Warn("Failed to remove package status artifact", zap.Error(err))
357+
}
358+
}
348359
}
349360

350361
func (c *Client) onConnectFailedHandler(_ context.Context, err error) {

updater/internal/updater/updater.go

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,6 @@ func (u *Updater) generateLinuxServiceFiles() error {
152152

153153
// Update performs the update of the collector binary
154154
func (u *Updater) Update() error {
155-
// Ensure we clean up the package status artifact after any update state
156-
defer u.removePackageStatusArtifact()
157-
158155
// Generate service files before stopping the service. If
159156
// this fails, the collector will still be running.
160157
if runtime.GOOS == "linux" {
@@ -232,6 +229,13 @@ func (u *Updater) Update() error {
232229
return fmt.Errorf("failed while monitoring for success: %w", err)
233230
}
234231

232+
// Remove package status artifact
233+
if err := os.Remove(packagestate.DefaultFileName); err != nil {
234+
u.logger.Error("Failed to remove package status artifact", zap.Error(err))
235+
} else {
236+
u.logger.Debug("Package status artifact removed")
237+
}
238+
235239
u.logger.Info("Update Complete")
236240
return nil
237241
}
@@ -243,14 +247,3 @@ func (u *Updater) removeTmpDir() {
243247
u.logger.Error("failed to remove temporary directory", zap.Error(err))
244248
}
245249
}
246-
247-
// removePackageStatusArtifact removes the default package status artifact.
248-
// if left on disk then the collectors will get confused about state and may think they've already updated
249-
func (u *Updater) removePackageStatusArtifact() {
250-
err := os.Remove(packagestate.DefaultFileName)
251-
if err != nil {
252-
u.logger.Error("failed to remove package status artifact", zap.Error(err))
253-
return
254-
}
255-
u.logger.Debug("package status artifact removed")
256-
}

0 commit comments

Comments
 (0)