Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
Signed-off-by: cardyok <[email protected]>
  • Loading branch information
cardyok committed Jan 24, 2025
1 parent 84ad45c commit f4a90c6
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 520 deletions.
35 changes: 19 additions & 16 deletions components/accelerator/nvidia/error/xid/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,25 @@ import (
"github.com/leptonai/gpud/components"
nvidia_common "github.com/leptonai/gpud/components/accelerator/nvidia/common"
nvidia_component_error_xid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/id"
"github.com/leptonai/gpud/components/accelerator/nvidia/error/xid/store"
nvidia_query_xid "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid"
"github.com/leptonai/gpud/components/db"
os_id "github.com/leptonai/gpud/components/os/id"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"
pkg_dmesg "github.com/leptonai/gpud/pkg/dmesg"
)

func New(ctx context.Context, cfg nvidia_common.Config, db *sql.DB) components.Component {
const DefaultRetentionPeriod = 3 * 24 * time.Hour

func New(ctx context.Context, cfg nvidia_common.Config, dbRW *sql.DB, dbRO *sql.DB) components.Component {
cfg.Query.SetDefaultsIfNotSet()
setDefaultPoller(cfg)

cctx, ccancel := context.WithCancel(ctx)
getDefaultPoller().Start(cctx, cfg.Query, nvidia_component_error_xid_id.Name)

setHealthyCh := make(chan struct{})
localStore, err := store.New(ctx, db, "components_accelerator_nvidia_error_xid_events")
localStore, err := db.NewStore(dbRW, dbRO, "components_accelerator_nvidia_error_xid_events", DefaultRetentionPeriod)
if err != nil {
log.Logger.Errorw("failed to create store", "error", err)
ccancel()
Expand Down Expand Up @@ -59,7 +61,7 @@ type XIDComponent struct {
poller query.Poller
currState components.State
setHealthyCh chan struct{}
store *store.Store
store db.Store
}

// Name returns the identifier of this component, as declared by the
// Name constant of the xid id package.
func (c *XIDComponent) Name() string {
	return nvidia_component_error_xid_id.Name
}
Expand Down Expand Up @@ -93,7 +95,7 @@ func (c *XIDComponent) Start() error {
time.Sleep(1 * time.Second)
continue
}
localEvents, err := c.store.GetAllEvents(c.rootCtx)
localEvents, err := c.store.Get(c.rootCtx, time.Time{})
if err != nil {
log.Logger.Errorw("failed to get all events", "error", err)
time.Sleep(1 * time.Second)
Expand All @@ -109,15 +111,11 @@ func (c *XIDComponent) Start() error {
case <-c.rootCtx.Done():
return
case <-c.setHealthyCh:
count, err := c.store.CreateEvent(c.rootCtx, components.Event{Time: metav1.Time{Time: time.Now().UTC()}, Name: "SetHealthy"})
if err != nil {
if err = c.store.Insert(c.rootCtx, components.Event{Time: metav1.Time{Time: time.Now().UTC()}, Name: "SetHealthy"}); err != nil {
log.Logger.Errorw("failed to create event", "error", err)
continue
} else if count == 0 {
log.Logger.Debugw("no new events created")
continue
}
events, err := c.store.GetAllEvents(c.rootCtx)
events, err := c.store.Get(c.rootCtx, time.Time{})
if err != nil {
log.Logger.Errorw("failed to get all events", "error", err)
continue
Expand All @@ -142,15 +140,20 @@ func (c *XIDComponent) Start() error {
EventKeyDeviceUUID: ev.DeviceUUID,
},
}
count, err := c.store.CreateEvent(c.rootCtx, event)
currEvent, err := c.store.Find(c.rootCtx, event)
if err != nil {
log.Logger.Errorw("failed to create event", "error", err)
continue
} else if count == 0 {
}
if currEvent != nil {
log.Logger.Debugw("no new events created")
continue
}
events, err := c.store.GetAllEvents(c.rootCtx)
if err = c.store.Insert(c.rootCtx, event); err != nil {
log.Logger.Errorw("failed to create event", "error", err)
continue
}
events, err := c.store.Get(c.rootCtx, time.Time{})
if err != nil {
log.Logger.Errorw("failed to get all events", "error", err)
continue
Expand All @@ -168,7 +171,7 @@ func (c *XIDComponent) States(_ context.Context) ([]components.State, error) {

func (c *XIDComponent) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
var ret []components.Event
events, err := c.store.GetEvents(ctx, since)
events, err := c.store.Get(ctx, since)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -203,7 +206,7 @@ func mergeEvents(a, b []components.Event) []components.Event {
result = append(result, b...)

sort.Slice(result, func(i, j int) bool {
return result[i].Time.Time.Before(result[j].Time.Time)
return result[i].Time.Time.After(result[j].Time.Time)
})

return result
Expand Down
8 changes: 4 additions & 4 deletions components/accelerator/nvidia/error/xid/component_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func TestMergeEvents(t *testing.T) {
assert.Equal(t, tt.expected, len(result))
if len(result) > 1 {
for i := 1; i < len(result); i++ {
assert.True(t, result[i-1].Time.Time.Before(result[i].Time.Time) ||
assert.True(t, result[i-1].Time.Time.After(result[i].Time.Time) ||
result[i-1].Time.Time.Equal(result[i].Time.Time),
"events should be sorted by timestamp")
}
Expand All @@ -95,10 +95,10 @@ func TestMergeEvents(t *testing.T) {
result := mergeEvents(a, b)
assert.Len(t, result, 4)
expectedTimes := []time.Time{
now.Add(-2 * time.Hour),
now.Add(-1 * time.Hour),
now,
now.Add(2 * time.Hour),
now,
now.Add(-1 * time.Hour),
now.Add(-2 * time.Hour),
}
for i, expectedTime := range expectedTimes {
assert.Equal(t, expectedTime.Unix(), result[i].Time.Unix(),
Expand Down
26 changes: 20 additions & 6 deletions components/accelerator/nvidia/error/xid/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,26 @@ const rebootThreshold = 2

// EvolveHealthyState resolves the state of the XID error component.
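// note: assume events are sorted by time in descending order (newest first);
// the slice is walked from the last index to the first so that events are processed oldest-first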
func EvolveHealthyState(events []components.Event) components.State {
func EvolveHealthyState(events []components.Event) (ret components.State) {
defer func() {
log.Logger.Debugf("EvolveHealthyState: %v", ret)
}()
var lastSuggestedAction *common.SuggestedActions
var lastXidErr *XidError
lastHealth := StateHealthy
xidRebootMap := make(map[uint64]int)
for _, event := range events {
for i := len(events) - 1; i >= 0; i-- {
event := events[i]
if event.Name == EventNameErroXid {
event = resolveXIDEvent(event)
resolvedEvent := resolveXIDEvent(event)
var currXidErr XidError
if err := json.Unmarshal([]byte(event.ExtraInfo[EventKeyErroXidData]), &currXidErr); err != nil {
log.Logger.Errorf("failed to unmarshal event %s %s extra info: %s", event.Name, event.Message, err)
if err := json.Unmarshal([]byte(resolvedEvent.ExtraInfo[EventKeyErroXidData]), &currXidErr); err != nil {
log.Logger.Errorf("failed to unmarshal event %s %s extra info: %s", resolvedEvent.Name, resolvedEvent.Message, err)
continue
}

currEvent := StateHealthy
switch event.Type {
switch resolvedEvent.Type {
case common.EventTypeCritical:
currEvent = StateDegraded
case common.EventTypeFatal:
Expand Down Expand Up @@ -129,3 +133,13 @@ func resolveXIDEvent(event components.Event) components.Event {
}
return ret
}

// sortEvents reorders events in place by timestamp in ascending order
// (oldest first): for every pair i < j, the two entries are swapped when
// events[i] is strictly later than events[j].
// NOTE(review): golangci-lint reports this function as unused — confirm
// whether it should be called somewhere or removed.
func sortEvents(events []components.Event) {

Check failure on line 137 in components/accelerator/nvidia/error/xid/helper.go

View workflow job for this annotation

GitHub Actions / golangci-lint

func `sortEvents` is unused (unused)
for i := 0; i < len(events); i++ {
for j := i + 1; j < len(events); j++ {
if events[i].Time.After(events[j].Time.Time) {
events[i], events[j] = events[j], events[i]
}
}
}
}
4 changes: 2 additions & 2 deletions components/accelerator/nvidia/error/xid/helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ func TestStateUpdateBasedOnEvents(t *testing.T) {

t.Run("reboot recover", func(t *testing.T) {
events := []components.Event{
createXidEvent(789, common.EventTypeCritical, common.RepairActionTypeRebootSystem),
{Name: "reboot"},
createXidEvent(789, common.EventTypeCritical, common.RepairActionTypeRebootSystem),
}
state := EvolveHealthyState(events)
assert.True(t, state.Healthy)
Expand All @@ -79,8 +79,8 @@ func TestStateUpdateBasedOnEvents(t *testing.T) {

t.Run("SetHealthy", func(t *testing.T) {
events := []components.Event{
createXidEvent(789, common.EventTypeFatal, common.RepairActionTypeRebootSystem),
{Name: "SetHealthy"},
createXidEvent(789, common.EventTypeFatal, common.RepairActionTypeRebootSystem),
}
state := EvolveHealthyState(events)
assert.True(t, state.Healthy)
Expand Down
Loading

0 comments on commit f4a90c6

Please sign in to comment.