Skip to content

Commit

Permalink
feat(nvidia/xid,sxid/dmesg): add dmesg log line matcher, xid/sxid ext…
Browse files Browse the repository at this point in the history
…ractor (#333)

For #321.

---------

Signed-off-by: Gyuho Lee <[email protected]>
  • Loading branch information
gyuho authored Jan 25, 2025
1 parent d24c32c commit 02d1814
Show file tree
Hide file tree
Showing 4 changed files with 485 additions and 0 deletions.
73 changes: 73 additions & 0 deletions components/accelerator/nvidia/query/sxid/dmesg/dmesg.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package dmesg

import (
"regexp"
"strconv"

"github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid"
)

// RegexNVSwitchSXidDmesg is the pattern used to pull the numeric SXid code
// out of an NVSwitch dmesg line. Its single capture group is the first run
// of digits that follows "SXid" and a ": " separator and is immediately
// followed by a comma.
//
// Example lines:
//
//	[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)
//	[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up
//
// Reference: "D.4 Non-Fatal NVSwitch SXid Errors" in
// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
const RegexNVSwitchSXidDmesg = `SXid.*?: (\d+),`

// RegexNVSwitchSXidDeviceUUID is the pattern used to pull the PCI device ID
// out of an NVSwitch SXid dmesg line. Its single capture group is the
// parenthesized "PCI:..." token (hex digits, ':' and '.') that directly
// follows "SXid ".
const RegexNVSwitchSXidDeviceUUID = `SXid \((PCI:[0-9a-fA-F:\.]+)\)`

// The patterns above are compiled once at package initialization so that the
// extractor functions never recompile them per log line.
var compiledRegexNVSwitchSXidDmesg = regexp.MustCompile(RegexNVSwitchSXidDmesg)

var compiledRegexNVSwitchSXidDeviceUUID = regexp.MustCompile(RegexNVSwitchSXidDeviceUUID)

// ExtractNVSwitchSXid returns the NVSwitch SXid error code found in a dmesg
// log line.
//
// It returns 0 when the line does not match the SXid code pattern, or when
// the captured digits cannot be converted by strconv.Atoi (for example,
// because the number is out of range for int).
//
// Reference:
// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
func ExtractNVSwitchSXid(line string) int {
	groups := compiledRegexNVSwitchSXidDmesg.FindStringSubmatch(line)
	if groups == nil {
		return 0
	}

	// groups[0] is the whole match; groups[1] is the captured code digits.
	code, err := strconv.Atoi(groups[1])
	if err != nil {
		return 0
	}
	return code
}

// ExtractNVSwitchSXidDeviceUUID returns the PCI device ID (for example
// "PCI:0000:05:00.0") that appears in parentheses right after "SXid" in a
// dmesg log line.
//
// It returns the empty string when the line does not match the device ID
// pattern.
func ExtractNVSwitchSXidDeviceUUID(line string) string {
	groups := compiledRegexNVSwitchSXidDeviceUUID.FindStringSubmatch(line)
	if groups == nil {
		return ""
	}
	// groups[1] is the captured "PCI:..." token without the parentheses.
	return groups[1]
}

// SXidError describes an NVSwitch SXid error parsed from a single dmesg log
// line. Match in this package is the constructor visible in this file.
type SXidError struct {
// SXid is the numeric SXid error code extracted from the log line.
SXid int `json:"sxid"`
// DeviceUUID is the PCI device ID token from the log line (for example
// "PCI:0000:05:00.0"); Match leaves it empty when no such token is found.
DeviceUUID string `json:"device_uuid"`
// Detail is the entry returned by sxid.GetDetail for the code. It is left
// out of the JSON encoding when nil.
Detail *sxid.Detail `json:"detail,omitempty"`
}

// Match parses a dmesg log line and returns the SXid error it describes.
//
// It returns nil when no non-zero SXid code can be extracted from the line,
// or when sxid.GetDetail does not report the extracted code as found. The
// device ID is extracted only after the code has been accepted, and may be
// the empty string if the line carries no recognizable PCI device token.
func Match(line string) *SXidError {
	code := ExtractNVSwitchSXid(line)
	if code == 0 {
		return nil
	}

	detail, found := sxid.GetDetail(code)
	if !found {
		return nil
	}

	return &SXidError{
		SXid:       code,
		DeviceUUID: ExtractNVSwitchSXidDeviceUUID(line),
		Detail:     detail,
	}
}
168 changes: 168 additions & 0 deletions components/accelerator/nvidia/query/sxid/dmesg/dmesg_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
package dmesg

import "testing"

// TestExtractNVSwitchSXid is a table-driven test of ExtractNVSwitchSXid. It
// covers lines with and without a timestamp prefix, lines with a data
// payload, a non-numeric code, and a line with no SXid entry at all. The
// extractor is expected to return the parsed code whether or not that code
// is a known SXid, and 0 when nothing can be parsed.
func TestExtractNVSwitchSXid(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name string
		line string
		want int
	}

	cases := []testCase{
		{
			name: "valid NVSwitch SXid error",
			line: "[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)",
			want: 12028,
		},
		{
			name: "another valid NVSwitch SXid error",
			line: "[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up",
			want: 20034,
		},
		{
			name: "NVSwitch SXid error without timestamp",
			line: "nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error",
			want: 12028,
		},
		{
			name: "no match",
			line: "Regular log content without SXid errors",
			want: 0,
		},
		{
			name: "NVSwitch SXid with non-numeric value",
			line: "nvidia-nvswitch0: SXid (PCI:0000:00:00.0): xyz, Fatal error",
			want: 0,
		},
		{
			name: "NVSwitch SXid with data payload",
			line: "[131453.740758] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 20034, Data {0x50610002, 0x10100030}",
			want: 20034,
		},
		{
			name: "NVSwitch SXid with unknown code",
			line: "[131453.740758] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 11111, Data {0x50610002, 0x10100030}",
			want: 11111,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			if got := ExtractNVSwitchSXid(tc.line); got != tc.want {
				t.Errorf("ExtractNVSwitchSXid(%q) = %d, want %d", tc.line, got, tc.want)
			}
		})
	}
}

// TestExtractNVSwitchSXidDeviceUUID is a table-driven test of
// ExtractNVSwitchSXidDeviceUUID. Well-formed lines must yield the "PCI:..."
// token; lines with no SXid entry or with a non-PCI token in the parentheses
// must yield the empty string.
func TestExtractNVSwitchSXidDeviceUUID(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name string
		line string
		want string
	}

	cases := []testCase{
		{
			name: "valid device ID with timestamp",
			line: "[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error",
			want: "PCI:0000:05:00.0",
		},
		{
			name: "valid device ID without timestamp",
			line: "nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up",
			want: "PCI:0000:00:00.0",
		},
		{
			name: "no device ID",
			line: "Regular log content without SXid",
			want: "",
		},
		{
			name: "malformed device ID",
			line: "nvidia-nvswitch0: SXid (invalid): some error",
			want: "",
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			if got := ExtractNVSwitchSXidDeviceUUID(tc.line); got != tc.want {
				t.Errorf("ExtractNVSwitchSXidDeviceUUID(%q) = %q, want %q", tc.line, got, tc.want)
			}
		})
	}
}

// TestMatch is a table-driven test of Match. Lines carrying codes 12028 and
// 20034 are expected to produce a populated SXidError (code, device ID and a
// non-nil Detail). Lines with no SXid entry, a non-numeric code, or the code
// 11111 are expected to produce nil.
func TestMatch(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name       string
		line       string
		wantNil    bool
		wantSXid   int
		wantDevice string
	}

	cases := []testCase{
		{
			name:       "valid NVSwitch SXid error",
			line:       "[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error",
			wantSXid:   12028,
			wantDevice: "PCI:0000:05:00.0",
		},
		{
			name:       "another valid NVSwitch SXid error",
			line:       "[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up",
			wantSXid:   20034,
			wantDevice: "PCI:0000:00:00.0",
		},
		{
			name:    "no SXid error",
			line:    "Regular log content without SXid errors",
			wantNil: true,
		},
		{
			name:    "invalid SXid number",
			line:    "nvidia-nvswitch0: SXid (PCI:0000:00:00.0): xyz, Fatal error",
			wantNil: true,
		},
		{
			name:    "unknown SXid code",
			line:    "[131453.740758] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 11111, Data {0x50610002, 0x10100030}",
			wantNil: true,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			got := Match(tc.line)

			switch {
			case tc.wantNil:
				// Nothing further to inspect when nil is the expected result.
				if got != nil {
					t.Errorf("Match(%q) = %+v, want nil", tc.line, got)
				}
				return
			case got == nil:
				// Fatal: the field checks below would dereference nil.
				t.Fatalf("Match(%q) = nil, want non-nil", tc.line)
			}

			if got.SXid != tc.wantSXid {
				t.Errorf("Match(%q).SXid = %d, want %d", tc.line, got.SXid, tc.wantSXid)
			}
			if got.DeviceUUID != tc.wantDevice {
				t.Errorf("Match(%q).DeviceUUID = %q, want %q", tc.line, got.DeviceUUID, tc.wantDevice)
			}
			if got.Detail == nil {
				t.Errorf("Match(%q).Detail = nil, want non-nil", tc.line)
			}
		})
	}
}
76 changes: 76 additions & 0 deletions components/accelerator/nvidia/query/xid/dmesg/dmesg.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package dmesg

import (
"regexp"
"strconv"

"github.com/leptonai/gpud/components/accelerator/nvidia/query/xid"
)

// RegexNVRMXidDmesg is the pattern used to pull the numeric Xid code out of
// an NVRM dmesg line. Its single capture group is the first run of digits
// that follows "NVRM: Xid" and a ": " separator and is immediately followed
// by a comma.
//
// Example lines:
//
//	[...] NVRM: Xid (0000:03:00): 14, Channel 00000001
//	[...] NVRM: Xid (PCI:0000:05:00): 79, pid='<unknown>', name=<unknown>, GPU has fallen off the bus.
//	NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus.
//
// Reference: https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf
const RegexNVRMXidDmesg = `NVRM: Xid.*?: (\d+),`

// RegexNVRMXidDeviceUUID is the pattern used to pull the PCI device ID out of
// an NVRM Xid dmesg line. Its single capture group is the parenthesized token
// that directly follows "NVRM: Xid ", in either the bare form (0000:03:00) or
// the prefixed form (PCI:0000:05:00). Only hex digits and ':' are accepted
// after the optional prefix, so a token containing '.' is not matched.
const RegexNVRMXidDeviceUUID = `NVRM: Xid \(((?:PCI:)?[0-9a-fA-F:]+)\)`

// The patterns above are compiled once at package initialization so that the
// extractor functions never recompile them per log line.
var compiledRegexNVRMXidDmesg = regexp.MustCompile(RegexNVRMXidDmesg)

var compiledRegexNVRMXidDeviceUUID = regexp.MustCompile(RegexNVRMXidDeviceUUID)

// ExtractNVRMXid returns the NVIDIA Xid error code found in a dmesg log line.
//
// It returns 0 when the line does not match the Xid code pattern, or when the
// captured digits cannot be converted by strconv.Atoi (for example, because
// the number is out of range for int).
//
// Reference: https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf
func ExtractNVRMXid(line string) int {
	groups := compiledRegexNVRMXidDmesg.FindStringSubmatch(line)
	if groups == nil {
		return 0
	}

	// groups[0] is the whole match; groups[1] is the captured code digits.
	code, err := strconv.Atoi(groups[1])
	if err != nil {
		return 0
	}
	return code
}

// ExtractNVRMXidDeviceUUID returns the PCI device ID that appears in
// parentheses right after "NVRM: Xid" in a dmesg log line.
//
// The token is returned exactly as written in the log: an ID logged with the
// "PCI:" prefix keeps that prefix, and an ID logged without it is returned
// without it. It returns the empty string when the line does not match the
// device ID pattern.
func ExtractNVRMXidDeviceUUID(line string) string {
	groups := compiledRegexNVRMXidDeviceUUID.FindStringSubmatch(line)
	if groups == nil {
		return ""
	}
	// groups[1] is the captured token without the surrounding parentheses.
	return groups[1]
}

// XidError describes an NVIDIA Xid error parsed from a single dmesg log
// line. Match in this package is the constructor visible in this file.
type XidError struct {
// Xid is the numeric Xid error code extracted from the log line.
Xid int `json:"xid"`
// DeviceUUID is the PCI device ID token from the log line, with or without
// a "PCI:" prefix as logged; Match leaves it empty when no token is found.
DeviceUUID string `json:"device_uuid"`
// Detail is the entry returned by xid.GetDetail for the code. It is left
// out of the JSON encoding when nil.
Detail *xid.Detail `json:"detail,omitempty"`
}

// Match parses a dmesg log line and returns the Xid error it describes.
//
// It returns nil when no non-zero Xid code can be extracted from the line, or
// when xid.GetDetail does not report the extracted code as found. The device
// ID is extracted only after the code has been accepted, and may be the empty
// string if the line carries no recognizable PCI device token.
func Match(line string) *XidError {
	code := ExtractNVRMXid(line)
	if code == 0 {
		return nil
	}

	detail, found := xid.GetDetail(code)
	if !found {
		return nil
	}

	return &XidError{
		Xid:        code,
		DeviceUUID: ExtractNVRMXidDeviceUUID(line),
		Detail:     detail,
	}
}
Loading

0 comments on commit 02d1814

Please sign in to comment.