diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ed629a5c..f5c5ad51 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,7 +13,7 @@ jobs: run: | make binary-frontend-test-coverage - name: Upload coverage - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: coverage path: ${{ github.workspace }}/webapp/frontend/coverage/lcov.info @@ -49,7 +49,7 @@ jobs: run: | make binary-clean binary-test-coverage - name: Upload coverage - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: coverage path: ${{ github.workspace }}/coverage.txt @@ -64,7 +64,7 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Download coverage reports - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: coverage - name: Upload coverage reports @@ -106,7 +106,7 @@ jobs: run: | make binary-clean binary-all - name: Archive - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: binaries.zip path: | diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 39778184..5ccfa70f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -61,7 +61,7 @@ jobs: with: version_metadata_path: ${{ github.event.inputs.version_metadata_path }} - name: Upload workspace - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: workspace path: ${{ github.workspace }}/**/* @@ -91,7 +91,7 @@ jobs: - { on: windows-latest, goos: windows, goarch: arm64 } steps: - name: Download workspace - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: workspace - uses: actions/setup-go@v3 @@ -101,7 +101,7 @@ jobs: run: | make binary-clean binary-all - name: Archive - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: binaries.zip path: | @@ -114,11 +114,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Download workspace - uses: 
actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: workspace - name: Download binaries - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: binaries.zip - name: List diff --git a/collector/pkg/collector/metrics.go b/collector/pkg/collector/metrics.go index 5d453ddf..10bb98d6 100644 --- a/collector/pkg/collector/metrics.go +++ b/collector/pkg/collector/metrics.go @@ -4,6 +4,12 @@ import ( "bytes" "encoding/json" "fmt" + "net/url" + "os" + "os/exec" + "strings" + "time" + "github.com/analogj/scrutiny/collector/pkg/common/shell" "github.com/analogj/scrutiny/collector/pkg/config" "github.com/analogj/scrutiny/collector/pkg/detect" @@ -11,10 +17,6 @@ import ( "github.com/analogj/scrutiny/collector/pkg/models" "github.com/samber/lo" "github.com/sirupsen/logrus" - "net/url" - "os" - "os/exec" - "strings" ) type MetricsCollector struct { @@ -90,8 +92,9 @@ func (mc *MetricsCollector) Run() error { //go mc.Collect(&wg, device.WWN, device.DeviceName, device.DeviceType) mc.Collect(device.WWN, device.DeviceName, device.DeviceType) - // TODO: we may need to sleep for between each call to smartctl -a - //time.Sleep(30 * time.Millisecond) + if mc.config.GetInt("commands.metrics_smartctl_wait") > 0 { + time.Sleep(time.Duration(mc.config.GetInt("commands.metrics_smartctl_wait")) * time.Second) + } } //mc.logger.Infoln("Main: Waiting for workers to finish") @@ -113,7 +116,7 @@ func (mc *MetricsCollector) Validate() error { return nil } -//func (mc *MetricsCollector) Collect(wg *sync.WaitGroup, deviceWWN string, deviceName string, deviceType string) { +// func (mc *MetricsCollector) Collect(wg *sync.WaitGroup, deviceWWN string, deviceName string, deviceType string) { func (mc *MetricsCollector) Collect(deviceWWN string, deviceName string, deviceType string) { //defer wg.Done() if len(deviceWWN) == 0 { diff --git a/collector/pkg/config/config.go b/collector/pkg/config/config.go index 0a0d156f..3e81880d 100644 --- 
a/collector/pkg/config/config.go +++ b/collector/pkg/config/config.go @@ -20,7 +20,7 @@ import ( type configuration struct { *viper.Viper - deviceOverrides []models.ScanOverride + deviceOverrides []models.ScanOverride } //Viper uses the following precedence order. Each item takes precedence over the item below it: @@ -47,9 +47,12 @@ func (c *configuration) Init() error { c.SetDefault("commands.metrics_scan_args", "--scan --json") c.SetDefault("commands.metrics_info_args", "--info --json") c.SetDefault("commands.metrics_smart_args", "--xall --json") + c.SetDefault("commands.metrics_smartctl_wait", 0) //c.SetDefault("collect.short.command", "-a -o on -S on") + c.SetDefault("allow_listed_devices", []string{}) + //if you want to load a non-standard location system config file (~/drawbridge.yml), use ReadConfig c.SetConfigType("yaml") //c.SetConfigName("drawbridge") @@ -186,3 +189,18 @@ func (c *configuration) GetCommandMetricsSmartArgs(deviceName string) string { } return c.GetString("commands.metrics_smart_args") } + +func (c *configuration) IsAllowlistedDevice(deviceName string) bool { + allowList := c.GetStringSlice("allow_listed_devices") + if len(allowList) == 0 { + return true + } + + for _, item := range allowList { + if item == deviceName { + return true + } + } + + return false +} diff --git a/collector/pkg/config/config_test.go b/collector/pkg/config/config_test.go index f123131b..3a262e91 100644 --- a/collector/pkg/config/config_test.go +++ b/collector/pkg/config/config_test.go @@ -144,3 +144,29 @@ func TestConfiguration_OverrideDeviceCommands_MetricsInfoArgs(t *testing.T) { require.Equal(t, "--info --json", testConfig.GetCommandMetricsInfoArgs("/dev/sdb")) //require.Equal(t, []models.ScanOverride{{Device: "/dev/sda", DeviceType: nil, Commands: {MetricsInfoArgs: "--info --json -T "}}}, scanOverrides) } + +func TestConfiguration_DeviceAllowList(t *testing.T) { + t.Parallel() + + t.Run("present", func(t *testing.T) { + testConfig, err := config.Create() + 
require.NoError(t, err) + + require.NoError(t, testConfig.ReadConfig(path.Join("testdata", "allow_listed_devices_present.yaml"))) + + require.True(t, testConfig.IsAllowlistedDevice("/dev/sda"), "/dev/sda should be allow listed") + require.False(t, testConfig.IsAllowlistedDevice("/dev/sdc"), "/dev/sda should not be allow listed") + }) + + t.Run("missing", func(t *testing.T) { + testConfig, err := config.Create() + require.NoError(t, err) + + // Really just any other config where the key is fully missing + require.NoError(t, testConfig.ReadConfig(path.Join("testdata", "override_device_commands.yaml"))) + + // Anything should be allow listed if the key isn't there + require.True(t, testConfig.IsAllowlistedDevice("/dev/sda"), "/dev/sda should be allow listed") + require.True(t, testConfig.IsAllowlistedDevice("/dev/sdc"), "/dev/sda should be allow listed") + }) +} diff --git a/collector/pkg/config/interface.go b/collector/pkg/config/interface.go index e810cec4..68bf9bc3 100644 --- a/collector/pkg/config/interface.go +++ b/collector/pkg/config/interface.go @@ -25,4 +25,6 @@ type Interface interface { GetDeviceOverrides() []models.ScanOverride GetCommandMetricsInfoArgs(deviceName string) string GetCommandMetricsSmartArgs(deviceName string) string + + IsAllowlistedDevice(deviceName string) bool } diff --git a/collector/pkg/config/mock/mock_config.go b/collector/pkg/config/mock/mock_config.go index 1b135b62..98a19bb6 100644 --- a/collector/pkg/config/mock/mock_config.go +++ b/collector/pkg/config/mock/mock_config.go @@ -175,6 +175,20 @@ func (mr *MockInterfaceMockRecorder) Init() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Init", reflect.TypeOf((*MockInterface)(nil).Init)) } +// IsAllowlistedDevice mocks base method. 
+func (m *MockInterface) IsAllowlistedDevice(deviceName string) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsAllowlistedDevice", deviceName) + ret0, _ := ret[0].(bool) + return ret0 +} + +// IsAllowlistedDevice indicates an expected call of IsAllowlistedDevice. +func (mr *MockInterfaceMockRecorder) IsAllowlistedDevice(deviceName interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsAllowlistedDevice", reflect.TypeOf((*MockInterface)(nil).IsAllowlistedDevice), deviceName) +} + // IsSet mocks base method. func (m *MockInterface) IsSet(key string) bool { m.ctrl.T.Helper() diff --git a/collector/pkg/config/testdata/allow_listed_devices_present.yaml b/collector/pkg/config/testdata/allow_listed_devices_present.yaml new file mode 100644 index 00000000..44b3b182 --- /dev/null +++ b/collector/pkg/config/testdata/allow_listed_devices_present.yaml @@ -0,0 +1,3 @@ +allow_listed_devices: +- /dev/sda +- /dev/sdb diff --git a/collector/pkg/detect/detect.go b/collector/pkg/detect/detect.go index 529ee3ea..d64a725c 100644 --- a/collector/pkg/detect/detect.go +++ b/collector/pkg/detect/detect.go @@ -124,6 +124,11 @@ func (d *Detect) TransformDetectedDevices(detectedDeviceConns models.Scan) []mod deviceFile := strings.ToLower(scannedDevice.Name) + // If the user has defined a device allow list, and this device isnt there, then ignore it + if !d.Config.IsAllowlistedDevice(deviceFile) { + continue + } + detectedDevice := models.Device{ HostId: d.Config.GetString("host.id"), DeviceType: scannedDevice.Type, diff --git a/collector/pkg/detect/detect_test.go b/collector/pkg/detect/detect_test.go index c2976f5c..cf64155c 100644 --- a/collector/pkg/detect/detect_test.go +++ b/collector/pkg/detect/detect_test.go @@ -24,6 +24,7 @@ func TestDetect_SmartctlScan(t *testing.T) { fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{}) 
fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") + fakeConfig.EXPECT().IsAllowlistedDevice(gomock.Any()).AnyTimes().Return(true) fakeShell := mock_shell.NewMockInterface(mockCtrl) testScanResults, err := os.ReadFile("testdata/smartctl_scan_simple.json") @@ -53,6 +54,7 @@ func TestDetect_SmartctlScan_Megaraid(t *testing.T) { fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{}) fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") + fakeConfig.EXPECT().IsAllowlistedDevice(gomock.Any()).AnyTimes().Return(true) fakeShell := mock_shell.NewMockInterface(mockCtrl) testScanResults, err := os.ReadFile("testdata/smartctl_scan_megaraid.json") @@ -85,6 +87,7 @@ func TestDetect_SmartctlScan_Nvme(t *testing.T) { fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{}) fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") + fakeConfig.EXPECT().IsAllowlistedDevice(gomock.Any()).AnyTimes().Return(true) fakeShell := mock_shell.NewMockInterface(mockCtrl) testScanResults, err := os.ReadFile("testdata/smartctl_scan_nvme.json") @@ -116,6 +119,7 @@ func TestDetect_TransformDetectedDevices_Empty(t *testing.T) { fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{}) fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") + fakeConfig.EXPECT().IsAllowlistedDevice(gomock.Any()).AnyTimes().Return(true) detectedDevices := models.Scan{ Devices: []models.ScanDevice{ @@ -149,6 
+153,7 @@ func TestDetect_TransformDetectedDevices_Ignore(t *testing.T) { fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{{Device: "/dev/sda", DeviceType: nil, Ignore: true}}) fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") + fakeConfig.EXPECT().IsAllowlistedDevice(gomock.Any()).AnyTimes().Return(true) detectedDevices := models.Scan{ Devices: []models.ScanDevice{ @@ -180,6 +185,7 @@ func TestDetect_TransformDetectedDevices_Raid(t *testing.T) { fakeConfig.EXPECT().GetString("host.id").AnyTimes().Return("") fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") + fakeConfig.EXPECT().IsAllowlistedDevice(gomock.Any()).AnyTimes().Return(true) fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{ { Device: "/dev/bus/0", @@ -223,6 +229,7 @@ func TestDetect_TransformDetectedDevices_Simple(t *testing.T) { fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{{Device: "/dev/sda", DeviceType: []string{"sat+megaraid"}}}) + fakeConfig.EXPECT().IsAllowlistedDevice(gomock.Any()).AnyTimes().Return(true) detectedDevices := models.Scan{ Devices: []models.ScanDevice{ { @@ -256,6 +263,7 @@ func TestDetect_TransformDetectedDevices_WithoutDeviceTypeOverride(t *testing.T) fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{{Device: 
"/dev/sda"}}) + fakeConfig.EXPECT().IsAllowlistedDevice(gomock.Any()).AnyTimes().Return(true) detectedDevices := models.Scan{ Devices: []models.ScanDevice{ { @@ -302,6 +310,46 @@ func TestDetect_TransformDetectedDevices_WhenDeviceNotDetected(t *testing.T) { require.Equal(t, "ata", transformedDevices[0].DeviceType) } +func TestDetect_TransformDetectedDevices_AllowListFilters(t *testing.T) { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + + fakeConfig := mock_config.NewMockInterface(mockCtrl) + fakeConfig.EXPECT().GetString("host.id").AnyTimes().Return("") + fakeConfig.EXPECT().GetString("commands.metrics_smartctl_bin").AnyTimes().Return("smartctl") + fakeConfig.EXPECT().GetString("commands.metrics_scan_args").AnyTimes().Return("--scan --json") + fakeConfig.EXPECT().GetDeviceOverrides().AnyTimes().Return([]models.ScanOverride{{Device: "/dev/sda", DeviceType: []string{"sat+megaraid"}}}) + fakeConfig.EXPECT().IsAllowlistedDevice("/dev/sda").Return(true) + fakeConfig.EXPECT().IsAllowlistedDevice("/dev/sdb").Return(false) + detectedDevices := models.Scan{ + Devices: []models.ScanDevice{ + { + Name: "/dev/sda", + InfoName: "/dev/sda", + Protocol: "ata", + Type: "ata", + }, + { + Name: "/dev/sdb", + InfoName: "/dev/sdb", + Protocol: "ata", + Type: "ata", + }, + }, + } + + d := detect.Detect{ + Config: fakeConfig, + } + + // test + transformedDevices := d.TransformDetectedDevices(detectedDevices) + + // assert + require.Equal(t, 1, len(transformedDevices)) + require.Equal(t, "sda", transformedDevices[0].DeviceName) +} + func TestDetect_SmartCtlInfo(t *testing.T) { t.Run("should report nvme info", func(t *testing.T) { ctrl := gomock.NewController(t) diff --git a/docker/example.hubspoke.docker-compose.yml b/docker/example.hubspoke.docker-compose.yml index c5b523b0..f301d603 100644 --- a/docker/example.hubspoke.docker-compose.yml +++ b/docker/example.hubspoke.docker-compose.yml @@ -41,6 +41,9 @@ services: environment: COLLECTOR_API_ENDPOINT: 'http://web:8080' 
COLLECTOR_HOST_ID: 'scrutiny-collector-hostname' + # If true forces the collector to run on startup (cron will be started after the collector completes) + # see: https://github.com/AnalogJ/scrutiny/blob/master/docs/TROUBLESHOOTING_DEVICE_COLLECTOR.md#collector-trigger-on-startup + COLLECTOR_RUN_STARTUP: false depends_on: web: condition: service_healthy diff --git a/docs/TESTERS.md b/docs/TESTERS.md index 088beba6..71c194dd 100644 --- a/docs/TESTERS.md +++ b/docs/TESTERS.md @@ -8,7 +8,7 @@ Thankfully the following users have been gracious enough to test/validate Scruti | Architecture Name | Binaries | Docker | | --- | --- | --- | -| linux-amd64 | -- | @feroxy @rshxyz | +| linux-amd64 | @TizzAmmazz | @feroxy @rshxyz | | linux-arm-5 | -- | | | linux-arm-6 | -- | | | linux-arm-7 | @Zorlin | @martini1992 | @@ -17,4 +17,4 @@ Thankfully the following users have been gracious enough to test/validate Scruti | macos-amd64 | -- | -- | | macos-arm64 | -- | -- | | windows-amd64 | @gabrielv33 | -- | -| windows-arm64 | -- | -- | \ No newline at end of file +| windows-arm64 | -- | -- | diff --git a/docs/TROUBLESHOOTING_DOCKER.md b/docs/TROUBLESHOOTING_DOCKER.md index 3d49897a..9b3fc266 100644 --- a/docs/TROUBLESHOOTING_DOCKER.md +++ b/docs/TROUBLESHOOTING_DOCKER.md @@ -17,3 +17,11 @@ So changing from `master-omnibus -> latest` will be the same thing for all inten > NOTE: Previously, there was a `automated cron build` that ran on the `master` and `beta` branches. They used to trigger a `nightly` build, even if nothing has changed on the branch. This has a couple of benefits, but one is to ensure that there's no broken external dependencies in our (unchanged) code. This `nightly` build no longer updates the `master-omnibus` tag. 
+ +# Running Docker `rootless` + +To prevent the container(s) from restarting when Docker is installed as `rootless`, you need to issue the following commands, which allow the session to stay alive even after you close your (SSH) session: + +`sudo loginctl enable-linger $(whoami)` + +`systemctl --user enable docker` diff --git a/docs/TROUBLESHOOTING_INFLUXDB.md b/docs/TROUBLESHOOTING_INFLUXDB.md index c89cae9f..291814df 100644 --- a/docs/TROUBLESHOOTING_INFLUXDB.md +++ b/docs/TROUBLESHOOTING_INFLUXDB.md @@ -11,6 +11,32 @@ dependency. It's a dedicated timeseries database, as opposed to the general purp a bunch of testing and analysis before I made the change. With InfluxDB the memory footprint for Scrutiny (at idle) is ~ 100mb, which is still fairly reasonable. +### Data Size + +It's surprisingly easy to reach extremely large database sizes, if you don't use downsampling, or you downsample incorrectly. +The growth rate is pretty unintuitive -- see https://github.com/AnalogJ/scrutiny/issues/650#issuecomment-2365174940 + +> Fasten stores the SMART metrics in a timeseries database (InfluxDB), and automatically downsamples the data on a schedule. +> +> The expectation was that cron would run daily, and there would be: +> +> - 7 daily data points +> - 3 weekly data points +> - 11 monthly data points +> - and infinite yearly data points. +> +> These data points would be for each SMART metric, for each device. +> eg. in one year, (7+3+11)*80ish SMART attributes = 1680 datapoints for one device +> +> If you're running cron every 15 minutes, your browser will instead be attempting to display: +> +> - 96*7 daily data points +> - 3 weekly +> - 11 monthly +> +> so (96*7 + 3 + 11)*80 = 54,880 datapoints for each device 😭 + + ## Installation InfluxDB is a required dependency for Scrutiny v0.4.0+. 
diff --git a/example.collector.yaml b/example.collector.yaml index d1fc9f16..04696102 100644 --- a/example.collector.yaml +++ b/example.collector.yaml @@ -81,6 +81,7 @@ devices: # metrics_scan_args: '--scan --json' # used to detect devices # metrics_info_args: '--info --json' # used to determine device unique ID & register device with Scrutiny # metrics_smart_args: '--xall --json' # used to retrieve smart data for each device. +# metrics_smartctl_wait: 0 # time to wait in seconds between each disk's check ########################################################################################################################