-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(nvidia/xid,sxid/dmesg): add dmesg log line matcher, xid/sxid extractor (#333)
For #321. --------- Signed-off-by: Gyuho Lee <[email protected]>
- Loading branch information
Showing
4 changed files
with
485 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
package dmesg | ||
|
||
import ( | ||
"regexp" | ||
"strconv" | ||
|
||
"github.com/leptonai/gpud/components/accelerator/nvidia/query/sxid" | ||
) | ||
|
||
// RegexNVSwitchSXidDmesg matches an NVSwitch SXid dmesg line and captures the
// run of digits that follows ": " and precedes a comma, e.g.,
//
//	[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)
//	[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up
//
// ref.
// "D.4 Non-Fatal NVSwitch SXid Errors"
// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf
const RegexNVSwitchSXidDmesg = `SXid.*?: (\d+),`

// RegexNVSwitchSXidDeviceUUID captures the parenthesized "PCI:..." token that
// directly follows "SXid " in an NVSwitch SXid message. Despite the name, the
// captured value is a PCI address string such as "PCI:0000:05:00.0".
const RegexNVSwitchSXidDeviceUUID = `SXid \((PCI:[0-9a-fA-F:\.]+)\)`

// Both patterns are compiled once at package initialization so that the
// extractors never recompile them per log line.
var (
	compiledRegexNVSwitchSXidDmesg      = regexp.MustCompile(RegexNVSwitchSXidDmesg)
	compiledRegexNVSwitchSXidDeviceUUID = regexp.MustCompile(RegexNVSwitchSXidDeviceUUID)
)
|
||
// Extracts the nvidia NVSwitch SXid error code from the dmesg log line. | ||
// Returns 0 if the error code is not found. | ||
// https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf | ||
func ExtractNVSwitchSXid(line string) int { | ||
if match := compiledRegexNVSwitchSXidDmesg.FindStringSubmatch(line); match != nil { | ||
if id, err := strconv.Atoi(match[1]); err == nil { | ||
return id | ||
} | ||
} | ||
return 0 | ||
} | ||
|
||
// ExtractNVSwitchSXidDeviceUUID extracts the PCI device ID from the dmesg log line. | ||
// Returns empty string if the device ID is not found. | ||
func ExtractNVSwitchSXidDeviceUUID(line string) string { | ||
if match := compiledRegexNVSwitchSXidDeviceUUID.FindStringSubmatch(line); match != nil { | ||
return match[1] | ||
} | ||
return "" | ||
} | ||
|
||
type SXidError struct { | ||
SXid int `json:"sxid"` | ||
DeviceUUID string `json:"device_uuid"` | ||
Detail *sxid.Detail `json:"detail,omitempty"` | ||
} | ||
|
||
// Returns a matching xid error object if found. | ||
// Otherwise, returns nil. | ||
func Match(line string) *SXidError { | ||
extractedID := ExtractNVSwitchSXid(line) | ||
if extractedID == 0 { | ||
return nil | ||
} | ||
detail, ok := sxid.GetDetail(extractedID) | ||
if !ok { | ||
return nil | ||
} | ||
deviceUUID := ExtractNVSwitchSXidDeviceUUID(line) | ||
return &SXidError{ | ||
SXid: extractedID, | ||
DeviceUUID: deviceUUID, | ||
Detail: detail, | ||
} | ||
} |
168 changes: 168 additions & 0 deletions
168
components/accelerator/nvidia/query/sxid/dmesg/dmesg_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
package dmesg | ||
|
||
import "testing" | ||
|
||
func TestExtractNVSwitchSXid(t *testing.T) { | ||
t.Parallel() | ||
|
||
tests := []struct { | ||
name string | ||
input string | ||
expected int | ||
}{ | ||
{ | ||
name: "valid NVSwitch SXid error", | ||
input: "[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)", | ||
expected: 12028, | ||
}, | ||
{ | ||
name: "another valid NVSwitch SXid error", | ||
input: "[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up", | ||
expected: 20034, | ||
}, | ||
{ | ||
name: "NVSwitch SXid error without timestamp", | ||
input: "nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error", | ||
expected: 12028, | ||
}, | ||
{ | ||
name: "no match", | ||
input: "Regular log content without SXid errors", | ||
expected: 0, | ||
}, | ||
{ | ||
name: "NVSwitch SXid with non-numeric value", | ||
input: "nvidia-nvswitch0: SXid (PCI:0000:00:00.0): xyz, Fatal error", | ||
expected: 0, | ||
}, | ||
{ | ||
name: "NVSwitch SXid with data payload", | ||
input: "[131453.740758] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 20034, Data {0x50610002, 0x10100030}", | ||
expected: 20034, | ||
}, | ||
{ | ||
name: "NVSwitch SXid with unknown code", | ||
input: "[131453.740758] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 11111, Data {0x50610002, 0x10100030}", | ||
expected: 11111, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
result := ExtractNVSwitchSXid(tt.input) | ||
if result != tt.expected { | ||
t.Errorf("ExtractNVSwitchSXid(%q) = %d, want %d", tt.input, result, tt.expected) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestExtractNVSwitchSXidDeviceUUID(t *testing.T) { | ||
t.Parallel() | ||
|
||
tests := []struct { | ||
name string | ||
input string | ||
expected string | ||
}{ | ||
{ | ||
name: "valid device ID with timestamp", | ||
input: "[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error", | ||
expected: "PCI:0000:05:00.0", | ||
}, | ||
{ | ||
name: "valid device ID without timestamp", | ||
input: "nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up", | ||
expected: "PCI:0000:00:00.0", | ||
}, | ||
{ | ||
name: "no device ID", | ||
input: "Regular log content without SXid", | ||
expected: "", | ||
}, | ||
{ | ||
name: "malformed device ID", | ||
input: "nvidia-nvswitch0: SXid (invalid): some error", | ||
expected: "", | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
result := ExtractNVSwitchSXidDeviceUUID(tt.input) | ||
if result != tt.expected { | ||
t.Errorf("ExtractNVSwitchSXidDeviceUUID(%q) = %q, want %q", tt.input, result, tt.expected) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestMatch(t *testing.T) { | ||
t.Parallel() | ||
|
||
tests := []struct { | ||
name string | ||
input string | ||
expectNil bool | ||
expectedSXid int | ||
expectedDevice string | ||
}{ | ||
{ | ||
name: "valid NVSwitch SXid error", | ||
input: "[111111111.111] nvidia-nvswitch3: SXid (PCI:0000:05:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error", | ||
expectNil: false, | ||
expectedSXid: 12028, | ||
expectedDevice: "PCI:0000:05:00.0", | ||
}, | ||
{ | ||
name: "another valid NVSwitch SXid error", | ||
input: "[131453.740743] nvidia-nvswitch0: SXid (PCI:0000:00:00.0): 20034, Fatal, Link 30 LTSSM Fault Up", | ||
expectNil: false, | ||
expectedSXid: 20034, | ||
expectedDevice: "PCI:0000:00:00.0", | ||
}, | ||
{ | ||
name: "no SXid error", | ||
input: "Regular log content without SXid errors", | ||
expectNil: true, | ||
}, | ||
{ | ||
name: "invalid SXid number", | ||
input: "nvidia-nvswitch0: SXid (PCI:0000:00:00.0): xyz, Fatal error", | ||
expectNil: true, | ||
}, | ||
{ | ||
name: "unknown SXid code", | ||
input: "[131453.740758] nvidia-nvswitch0: SXid (PCI:0000:a9:00.0): 11111, Data {0x50610002, 0x10100030}", | ||
expectNil: true, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
result := Match(tt.input) | ||
if tt.expectNil { | ||
if result != nil { | ||
t.Errorf("Match(%q) = %+v, want nil", tt.input, result) | ||
} | ||
return | ||
} | ||
|
||
if result == nil { | ||
t.Fatalf("Match(%q) = nil, want non-nil", tt.input) | ||
} | ||
|
||
if result.SXid != tt.expectedSXid { | ||
t.Errorf("Match(%q).SXid = %d, want %d", tt.input, result.SXid, tt.expectedSXid) | ||
} | ||
|
||
if result.DeviceUUID != tt.expectedDevice { | ||
t.Errorf("Match(%q).DeviceUUID = %q, want %q", tt.input, result.DeviceUUID, tt.expectedDevice) | ||
} | ||
|
||
if result.Detail == nil { | ||
t.Errorf("Match(%q).Detail = nil, want non-nil", tt.input) | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
package dmesg | ||
|
||
import ( | ||
"regexp" | ||
"strconv" | ||
|
||
"github.com/leptonai/gpud/components/accelerator/nvidia/query/xid" | ||
) | ||
|
||
// RegexNVRMXidDmesg matches an NVRM Xid dmesg line and captures the run of
// digits that follows ": " and precedes a comma, e.g.,
//
//	[...] NVRM: Xid (0000:03:00): 14, Channel 00000001
//	[...] NVRM: Xid (PCI:0000:05:00): 79, pid='<unknown>', name=<unknown>, GPU has fallen off the bus.
//	NVRM: Xid (PCI:0000:01:00): 79, GPU has fallen off the bus.
//
// ref.
// https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf
const RegexNVRMXidDmesg = `NVRM: Xid.*?: (\d+),`

// RegexNVRMXidDeviceUUID captures the parenthesized PCI device token that
// directly follows "NVRM: Xid ". The "PCI:" prefix is optional, so both
// (0000:03:00) and (PCI:0000:05:00) are accepted; when present, the prefix is
// part of the captured value.
const RegexNVRMXidDeviceUUID = `NVRM: Xid \(((?:PCI:)?[0-9a-fA-F:]+)\)`

// Both patterns are compiled once at package initialization so that the
// extractors never recompile them per log line.
var (
	compiledRegexNVRMXidDmesg      = regexp.MustCompile(RegexNVRMXidDmesg)
	compiledRegexNVRMXidDeviceUUID = regexp.MustCompile(RegexNVRMXidDeviceUUID)
)
|
||
// Extracts the nvidia Xid error code from the dmesg log line. | ||
// Returns 0 if the error code is not found. | ||
// https://docs.nvidia.com/deploy/pdf/XID_Errors.pdf | ||
func ExtractNVRMXid(line string) int { | ||
if match := compiledRegexNVRMXidDmesg.FindStringSubmatch(line); match != nil { | ||
if id, err := strconv.Atoi(match[1]); err == nil { | ||
return id | ||
} | ||
} | ||
return 0 | ||
} | ||
|
||
// ExtractNVRMXidDeviceUUID extracts the PCI device ID from the NVRM Xid dmesg log line. | ||
// For input without "PCI:" prefix, it returns the ID as is. | ||
// For input with "PCI:" prefix, it returns the full ID including the prefix. | ||
// Returns empty string if the device ID is not found. | ||
func ExtractNVRMXidDeviceUUID(line string) string { | ||
if match := compiledRegexNVRMXidDeviceUUID.FindStringSubmatch(line); match != nil { | ||
return match[1] | ||
} | ||
return "" | ||
} | ||
|
||
type XidError struct { | ||
Xid int `json:"xid"` | ||
DeviceUUID string `json:"device_uuid"` | ||
Detail *xid.Detail `json:"detail,omitempty"` | ||
} | ||
|
||
// Returns a matching xid error object if found. | ||
// Otherwise, returns nil. | ||
func Match(line string) *XidError { | ||
extractedID := ExtractNVRMXid(line) | ||
if extractedID == 0 { | ||
return nil | ||
} | ||
detail, ok := xid.GetDetail(extractedID) | ||
if !ok { | ||
return nil | ||
} | ||
deviceUUID := ExtractNVRMXidDeviceUUID(line) | ||
return &XidError{ | ||
Xid: extractedID, | ||
DeviceUUID: deviceUUID, | ||
Detail: detail, | ||
} | ||
} |
Oops, something went wrong.