From d7382f9c22b4acded9e507b90821119d9bb69a2e Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Mon, 1 Nov 2021 21:06:02 +0000 Subject: [PATCH 01/10] Add btrfs mountpoint labels and device error stats Makes ioctl syscalls to load the device error stats. Adds filesystem mountpoint labels to existing metrics for ease of use. Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 294 ++++++++++++++++++++++++++++++++-- collector/btrfs_linux_test.go | 2 +- 2 files changed, 282 insertions(+), 14 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index 01971c7c1a..d56acfc3d5 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -17,7 +17,11 @@ package collector import ( + "bytes" "fmt" + "os" + "syscall" + "unsafe" "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" @@ -52,16 +56,214 @@ func NewBtrfsCollector(logger log.Logger) (Collector, error) { func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error { stats, err := c.fs.Stats() if err != nil { - return fmt.Errorf("failed to retrieve Btrfs stats: %w", err) + return fmt.Errorf("failed to retrieve Btrfs stats from procfs: %w", err) + } + + ioctlStatsMap, err := c.getIoctlStats() + if err != nil { + return fmt.Errorf("failed to retrieve Btrfs stats with ioctl: %w", err) } for _, s := range stats { - c.updateBtrfsStats(ch, s) + // match up procfs and ioctl info by filesystem UUID + ioctlStats := ioctlStatsMap[s.UUID] + c.updateBtrfsStats(ch, s, ioctlStats) } return nil } +type ioctlFsDeviceStats struct { + path string + uuid string + + bytesUsed uint64 + totalBytes uint64 + + writeErrors uint64 + readErrors uint64 + flushErrors uint64 + corruptionErrors uint64 + generationErrors uint64 +} + +type ioctlFsStats struct { + uuid string + mountPoint string + devices []ioctlFsDeviceStats +} + +func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { + // Instead of introducing more ioctl calls to scan for all btrfs + // filesytems re-use our mount point utils to find known mounts + mountsList, err := mountPointDetails(c.logger) + if err != nil { + return nil, err + } + + btrfsMounts := []string{} + for _, mount := range mountsList { + if mount.fsType == "btrfs" { + btrfsMounts = append(btrfsMounts, mount.mountPoint) + } + } + + var statsMap = make(map[string]*ioctlFsStats, len(btrfsMounts)) + + for _, mountPoint := range btrfsMounts { + ioctlStats, err := c.getIoctlFsStats(mountPoint) + if err != nil { + return nil, err + } + statsMap[ioctlStats.uuid] = ioctlStats + } + + return statsMap, nil +} + +// Magic constants for ioctl +//nolint:revive +const ( + _BTRFS_IOC_FS_INFO = 0x8400941F + _BTRFS_IOC_DEV_INFO = 0xD000941E + _BTRFS_IOC_GET_DEV_STATS = 0x00c4089434 +) + +// Known/supported device stats fields +//nolint:revive +const ( + // direct indicators of I/O failures: + + _BTRFS_DEV_STAT_WRITE_ERRS = iota + _BTRFS_DEV_STAT_READ_ERRS + _BTRFS_DEV_STAT_FLUSH_ERRS + + // indirect indicators of I/O failures: + + _BTRFS_DEV_STAT_CORRUPTION_ERRS // checksum error, bytenr error or contents is illegal + _BTRFS_DEV_STAT_GENERATION_ERRS // an indication that blocks have not been written + + _BTRFS_DEV_STAT_VALUES_MAX // counter to indicate the number of known stats we support +) + +type _UuidBytes [16]byte + +func (id _UuidBytes) String() string { + return fmt.Sprintf("%x-%x-%x-%x-%x", id[0:4], id[4:6], id[6:8], id[8:10], id[10:]) +} + +//name matches linux struct +//nolint:revive +type btrfs_ioctl_fs_info_args struct { + maxID uint64 // out + numDevices uint64 // out + fsID _UuidBytes // out + nodeSize uint32 // out + sectorSize uint32 // out + cloneAlignment uint32 // out + _ [122*8 + 4]byte // pad to 1k +} + +//name matches linux struct +//nolint:revive +type btrfs_ioctl_dev_info_args struct { + deviceID uint64 // in/out + uuid _UuidBytes // in/out + bytesUsed uint64 // out + totalBytes uint64 // out + _ [379]uint64 // pad to 4k + path [1024]byte // out +} + +//name matches linux struct +//nolint:revive +type btrfs_ioctl_get_dev_stats struct { + deviceID uint64 // in + itemCount uint64 // in/out + flags uint64 // in/out + values [_BTRFS_DEV_STAT_VALUES_MAX]uint64 // out values + _ [128 - 2 - _BTRFS_DEV_STAT_VALUES_MAX]uint64 // pad to 1k +} + +func (c *btrfsCollector) getIoctlFsStats(mountPoint string) (*ioctlFsStats, error) { + fd, err := os.Open(mountPoint) + if err != nil { + return nil, err + } + + var fsInfo = btrfs_ioctl_fs_info_args{} + _, _, errno := syscall.Syscall( + syscall.SYS_IOCTL, + fd.Fd(), + _BTRFS_IOC_FS_INFO, + uintptr(unsafe.Pointer(&fsInfo))) + if errno != 0 { + return nil, err + } + + devices := make([]ioctlFsDeviceStats, 0, fsInfo.numDevices) + + var deviceInfo btrfs_ioctl_dev_info_args + var deviceStats btrfs_ioctl_get_dev_stats + + for i := uint64(0); i <= fsInfo.maxID; i++ { + deviceInfo = btrfs_ioctl_dev_info_args{ + deviceID: i, + } + deviceStats = btrfs_ioctl_get_dev_stats{ + deviceID: i, + itemCount: _BTRFS_DEV_STAT_VALUES_MAX, + } + + _, _, errno := syscall.Syscall( + syscall.SYS_IOCTL, + fd.Fd(), + uintptr(_BTRFS_IOC_DEV_INFO), + uintptr(unsafe.Pointer(&deviceInfo))) + + if errno == syscall.ENODEV { + // device IDs do not consistently start at 0, so we expect this + continue + } + if errno != 0 { + return nil, errno + } + + _, _, errno = syscall.Syscall( + syscall.SYS_IOCTL, + fd.Fd(), + uintptr(_BTRFS_IOC_GET_DEV_STATS), + uintptr(unsafe.Pointer(&deviceStats))) + + if errno != 0 { + return nil, errno + } + + devices = append(devices, ioctlFsDeviceStats{ + path: string(bytes.Trim(deviceInfo.path[:], "\x00")), + uuid: deviceInfo.uuid.String(), + bytesUsed: deviceInfo.bytesUsed, + totalBytes: deviceInfo.totalBytes, + + writeErrors: deviceStats.values[_BTRFS_DEV_STAT_WRITE_ERRS], + readErrors: deviceStats.values[_BTRFS_DEV_STAT_READ_ERRS], + flushErrors: deviceStats.values[_BTRFS_DEV_STAT_FLUSH_ERRS], + corruptionErrors: deviceStats.values[_BTRFS_DEV_STAT_CORRUPTION_ERRS], + generationErrors: deviceStats.values[_BTRFS_DEV_STAT_GENERATION_ERRS], + }) + + if uint64(len(devices)) == fsInfo.numDevices { + break + } + } + + return &ioctlFsStats{ + mountPoint: mountPoint, + uuid: fsInfo.fsID.String(), + devices: devices, + }, nil +} + // btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric. type btrfsMetric struct { name string @@ -72,14 +274,18 @@ type btrfsMetric struct { } // updateBtrfsStats collects statistics for one bcache ID. -func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats) { +func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, iocStats *ioctlFsStats) { const subsystem = "btrfs" // Basic information about the filesystem. devLabels := []string{"uuid"} + if iocStats != nil { + devLabels = append(devLabels, "mountpoint") + } + // Retrieve the metrics. - metrics := c.getMetrics(s) + metrics := c.getMetrics(s, iocStats) // Convert all gathered metrics to Prometheus Metrics and add to channel. for _, m := range metrics { @@ -93,6 +299,9 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs. ) labelValues := []string{s.UUID} + if iocStats != nil { + labelValues = append(labelValues, iocStats.mountPoint) + } if len(m.extraLabelValue) > 0 { labelValues = append(labelValues, m.extraLabelValue...) } @@ -107,7 +316,7 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs. } // getMetrics returns metrics for the given Btrfs statistics. -func (c *btrfsCollector) getMetrics(s *btrfs.Stats) []btrfsMetric { +func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []btrfsMetric { metrics := []btrfsMetric{ { name: "info", @@ -124,14 +333,73 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats) []btrfsMetric { } // Information about devices. - for n, dev := range s.Devices { - metrics = append(metrics, btrfsMetric{ - name: "device_size_bytes", - desc: "Size of a device that is part of the filesystem.", - value: float64(dev.Size), - extraLabel: []string{"device"}, - extraLabelValue: []string{n}, - }) + if iocStats == nil { + for n, dev := range s.Devices { + metrics = append(metrics, btrfsMetric{ + name: "device_size_bytes", + desc: "Size of a device that is part of the filesystem.", + value: float64(dev.Size), + extraLabel: []string{"device"}, + extraLabelValue: []string{n}, + }) + } + } else { + for _, dev := range iocStats.devices { + extraLabels := []string{"device", "device_uuid"} + extraLabelValues := []string{dev.path, dev.uuid} + metrics = append(metrics, + btrfsMetric{ + name: "device_size_bytes", + desc: "Size of a device that is part of the filesystem.", + value: float64(dev.totalBytes), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + btrfsMetric{ + name: "device_used_bytes", + desc: "Bytes used on a device that is part of the filesystem.", + value: float64(dev.bytesUsed), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + // TODO should the below metrics be a single metric with a varying 'error_type' label? + btrfsMetric{ + name: "device_write_errors", + desc: "TODO", + value: float64(dev.writeErrors), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + btrfsMetric{ + name: "device_read_errors", + desc: "TODO", + value: float64(dev.readErrors), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + btrfsMetric{ + name: "device_flush_errors", + desc: "TODO", + value: float64(dev.flushErrors), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + btrfsMetric{ + name: "device_corruption_errors", + desc: "TODO", + value: float64(dev.corruptionErrors), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + btrfsMetric{ + name: "device_generation_errors", + desc: "TODO", + value: float64(dev.generationErrors), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + ) + } } // Information about data, metadata and system data. diff --git a/collector/btrfs_linux_test.go b/collector/btrfs_linux_test.go index 7ce19aafab..eef90f64e9 100644 --- a/collector/btrfs_linux_test.go +++ b/collector/btrfs_linux_test.go @@ -104,7 +104,7 @@ func TestBtrfs(t *testing.T) { } for i, s := range stats { - metrics := collector.getMetrics(s) + metrics := collector.getMetrics(s, nil) if len(metrics) != len(expectedBtrfsMetrics[i]) { t.Fatalf("Unexpected number of Btrfs metrics: expected %v, got %v", len(expectedBtrfsMetrics[i]), len(metrics)) } From 219118b724c207c717a296a1317be2dbbf6be46e Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Thu, 11 Nov 2021 21:34:25 +0000 Subject: [PATCH 02/10] Improve metrics filesystem scanning logic Now has a better strategy for scanning mount points for filesystems, which will hopefully avoid duplicate work well. Metric names, labels and metadata have been improved. It no longer adds a mountpoint label to btrfs metrics, as I have come to better understand that there is no canonical mount point. Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 258 +++++++++++++++++++++------------------ 1 file changed, 138 insertions(+), 120 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index d56acfc3d5..f269ad500e 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -20,6 +20,7 @@ import ( "bytes" "fmt" "os" + "strings" "syscall" "unsafe" @@ -80,17 +81,38 @@ type ioctlFsDeviceStats struct { bytesUsed uint64 totalBytes uint64 - writeErrors uint64 - readErrors uint64 - flushErrors uint64 - corruptionErrors uint64 - generationErrors uint64 + errorStats map[string]uint64 } type ioctlFsStats struct { - uuid string - mountPoint string - devices []ioctlFsDeviceStats + uuid string + devices []ioctlFsDeviceStats +} + +// Magic constants for ioctl +//nolint:revive +const ( + _BTRFS_IOC_FS_INFO = 0x8400941F + _BTRFS_IOC_DEV_INFO = 0xD000941E + _BTRFS_IOC_GET_DEV_STATS = 0x00c4089434 +) + +type _UuidBytes [16]byte + +func (id _UuidBytes) String() string { + return fmt.Sprintf("%x-%x-%x-%x-%x", id[0:4], id[4:6], id[6:8], id[8:10], id[10:]) +} + +//name matches linux struct +//nolint:revive +type btrfs_ioctl_fs_info_args struct { + maxID uint64 // out + numDevices uint64 // out + fsID _UuidBytes // out + nodeSize uint32 // out + sectorSize uint32 // out + cloneAlignment uint32 // out + _ [122*8 + 4]byte // pad to 1k } func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { @@ -101,34 +123,61 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { return nil, err } - btrfsMounts := []string{} + // track devices we have successfully scanned, by device path + devicesDone := make(map[string]struct{}) + // filesystems scann results by UUID + fsStats := make(map[string]*ioctlFsStats) + for _, mount := range mountsList { - if mount.fsType == "btrfs" { - btrfsMounts = append(btrfsMounts, mount.mountPoint) + if mount.fsType != "btrfs" { + continue } - } - var statsMap = make(map[string]*ioctlFsStats, len(btrfsMounts)) + if _, found := devicesDone[mount.device]; found { + // We already found this filesystem by another mount point + continue + } - for _, mountPoint := range btrfsMounts { - ioctlStats, err := c.getIoctlFsStats(mountPoint) + fd, err := os.Open(mount.mountPoint) + if err != nil { + // failed to open this mount point, maybe we didn't have permission + // maybe we'll find another mount point for this FS later + continue + } + + var fsInfo = btrfs_ioctl_fs_info_args{} + _, _, errno := syscall.Syscall( + syscall.SYS_IOCTL, + fd.Fd(), + uintptr(_BTRFS_IOC_FS_INFO), + uintptr(unsafe.Pointer(&fsInfo))) + if errno != 0 { + // Failed to get the FS info for some reason, + // perhaps it'll work with a different mount point + continue + } + + fsID := fsInfo.fsID.String() + if _, found := fsStats[fsID]; found { + // We already found this filesystem by another mount point + continue + } + + deviceStats, err := c.getIoctlDeviceStats(fd, &fsInfo) if err != nil { return nil, err } - statsMap[ioctlStats.uuid] = ioctlStats + + devicesDone[mount.device] = struct{}{} + fsStats[fsID] = &ioctlFsStats{ + uuid: fsID, + devices: deviceStats, + } } - return statsMap, nil + return fsStats, nil } -// Magic constants for ioctl -//nolint:revive -const ( - _BTRFS_IOC_FS_INFO = 0x8400941F - _BTRFS_IOC_DEV_INFO = 0xD000941E - _BTRFS_IOC_GET_DEV_STATS = 0x00c4089434 -) - // Known/supported device stats fields //nolint:revive const ( @@ -146,22 +195,19 @@ const ( _BTRFS_DEV_STAT_VALUES_MAX // counter to indicate the number of known stats we support ) -type _UuidBytes [16]byte - -func (id _UuidBytes) String() string { - return fmt.Sprintf("%x-%x-%x-%x-%x", id[0:4], id[4:6], id[6:8], id[8:10], id[10:]) +var errorStatFields = []int{ + _BTRFS_DEV_STAT_WRITE_ERRS, + _BTRFS_DEV_STAT_READ_ERRS, + _BTRFS_DEV_STAT_FLUSH_ERRS, + _BTRFS_DEV_STAT_CORRUPTION_ERRS, + _BTRFS_DEV_STAT_GENERATION_ERRS, } - -//name matches linux struct -//nolint:revive -type btrfs_ioctl_fs_info_args struct { - maxID uint64 // out - numDevices uint64 // out - fsID _UuidBytes // out - nodeSize uint32 // out - sectorSize uint32 // out - cloneAlignment uint32 // out - _ [122*8 + 4]byte // pad to 1k +var errorStatNames = []string{ + "write", + "read", + "flush", + "corruption", + "generation", } //name matches linux struct @@ -185,22 +231,7 @@ type btrfs_ioctl_get_dev_stats struct { _ [128 - 2 - _BTRFS_DEV_STAT_VALUES_MAX]uint64 // pad to 1k } -func (c *btrfsCollector) getIoctlFsStats(mountPoint string) (*ioctlFsStats, error) { - fd, err := os.Open(mountPoint) - if err != nil { - return nil, err - } - - var fsInfo = btrfs_ioctl_fs_info_args{} - _, _, errno := syscall.Syscall( - syscall.SYS_IOCTL, - fd.Fd(), - _BTRFS_IOC_FS_INFO, - uintptr(unsafe.Pointer(&fsInfo))) - if errno != 0 { - return nil, err - } - +func (c *btrfsCollector) getIoctlDeviceStats(fd *os.File, fsInfo *btrfs_ioctl_fs_info_args) ([]ioctlFsDeviceStats, error) { devices := make([]ioctlFsDeviceStats, 0, fsInfo.numDevices) var deviceInfo btrfs_ioctl_dev_info_args @@ -239,17 +270,19 @@ func (c *btrfsCollector) getIoctlFsStats(mountPoint string) (*ioctlFsStats, erro return nil, errno } + errorStats := make(map[string]uint64, deviceStats.itemCount) + for i, fieldIndex := range errorStatFields { + if int(deviceStats.itemCount) >= fieldIndex { + errorStats[errorStatNames[i]] = deviceStats.values[fieldIndex] + } + } + devices = append(devices, ioctlFsDeviceStats{ path: string(bytes.Trim(deviceInfo.path[:], "\x00")), uuid: deviceInfo.uuid.String(), bytesUsed: deviceInfo.bytesUsed, totalBytes: deviceInfo.totalBytes, - - writeErrors: deviceStats.values[_BTRFS_DEV_STAT_WRITE_ERRS], - readErrors: deviceStats.values[_BTRFS_DEV_STAT_READ_ERRS], - flushErrors: deviceStats.values[_BTRFS_DEV_STAT_FLUSH_ERRS], - corruptionErrors: deviceStats.values[_BTRFS_DEV_STAT_CORRUPTION_ERRS], - generationErrors: deviceStats.values[_BTRFS_DEV_STAT_GENERATION_ERRS], + errorStats: errorStats, }) if uint64(len(devices)) == fsInfo.numDevices { @@ -257,16 +290,13 @@ func (c *btrfsCollector) getIoctlFsStats(mountPoint string) (*ioctlFsStats, erro } } - return &ioctlFsStats{ - mountPoint: mountPoint, - uuid: fsInfo.fsID.String(), - devices: devices, - }, nil + return devices, nil } // btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric. type btrfsMetric struct { name string + metricType prometheus.ValueType desc string value float64 extraLabel []string @@ -280,10 +310,6 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs. // Basic information about the filesystem. devLabels := []string{"uuid"} - if iocStats != nil { - devLabels = append(devLabels, "mountpoint") - } - // Retrieve the metrics. metrics := c.getMetrics(s, iocStats) @@ -299,16 +325,13 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs. ) labelValues := []string{s.UUID} - if iocStats != nil { - labelValues = append(labelValues, iocStats.mountPoint) - } if len(m.extraLabelValue) > 0 { labelValues = append(labelValues, m.extraLabelValue...) } ch <- prometheus.MustNewConstMetric( desc, - prometheus.GaugeValue, + m.metricType, m.value, labelValues..., ) @@ -322,13 +345,15 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt name: "info", desc: "Filesystem information", value: 1, + metricType: prometheus.GaugeValue, extraLabel: []string{"label"}, extraLabelValue: []string{s.Label}, }, { - name: "global_rsv_size_bytes", - desc: "Size of global reserve.", - value: float64(s.Allocation.GlobalRsvSize), + name: "global_rsv_size_bytes", + desc: "Size of global reserve.", + metricType: prometheus.GaugeValue, + value: float64(s.Allocation.GlobalRsvSize), }, } @@ -338,6 +363,7 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt metrics = append(metrics, btrfsMetric{ name: "device_size_bytes", desc: "Size of a device that is part of the filesystem.", + metricType: prometheus.GaugeValue, value: float64(dev.Size), extraLabel: []string{"device"}, extraLabelValue: []string{n}, @@ -345,60 +371,48 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt } } else { for _, dev := range iocStats.devices { + // trim the /dev/ prefix from the device name so the value should match + // the value used in the fallback branch above + device := strings.TrimPrefix(dev.path, "/dev/") + extraLabels := []string{"device", "device_uuid"} - extraLabelValues := []string{dev.path, dev.uuid} + extraLabelValues := []string{device, dev.uuid} + metrics = append(metrics, btrfsMetric{ name: "device_size_bytes", desc: "Size of a device that is part of the filesystem.", + metricType: prometheus.GaugeValue, value: float64(dev.totalBytes), extraLabel: extraLabels, extraLabelValue: extraLabelValues, }, + // A bytes available metric is probably more useful than a + // bytes used metric, because large numbers of bytes will + // suffer from floating point representation issues + // and we probably care more about the number when it's low anyway btrfsMetric{ - name: "device_used_bytes", - desc: "Bytes used on a device that is part of the filesystem.", - value: float64(dev.bytesUsed), - extraLabel: extraLabels, - extraLabelValue: extraLabelValues, - }, - // TODO should the below metrics be a single metric with a varying 'error_type' label? - btrfsMetric{ - name: "device_write_errors", - desc: "TODO", - value: float64(dev.writeErrors), - extraLabel: extraLabels, - extraLabelValue: extraLabelValues, - }, - btrfsMetric{ - name: "device_read_errors", - desc: "TODO", - value: float64(dev.readErrors), + name: "device_unused_bytes", + desc: "Unused bytes unused on a device that is part of the filesystem.", + metricType: prometheus.GaugeValue, + value: float64(dev.totalBytes - dev.bytesUsed), extraLabel: extraLabels, extraLabelValue: extraLabelValues, - }, - btrfsMetric{ - name: "device_flush_errors", - desc: "TODO", - value: float64(dev.flushErrors), - extraLabel: extraLabels, - extraLabelValue: extraLabelValues, - }, - btrfsMetric{ - name: "device_corruption_errors", - desc: "TODO", - value: float64(dev.corruptionErrors), - extraLabel: extraLabels, - extraLabelValue: extraLabelValues, - }, - btrfsMetric{ - name: "device_generation_errors", - desc: "TODO", - value: float64(dev.generationErrors), - extraLabel: extraLabels, - extraLabelValue: extraLabelValues, - }, - ) + }) + + errorLabels := append([]string{"type"}, extraLabels...) + for errorName, count := range dev.errorStats { + errorLabelValues := append([]string{errorName}, extraLabelValues...) + metrics = append(metrics, + btrfsMetric{ + name: "device_errors_total", + desc: "Errors reported for the device", + metricType: prometheus.CounterValue, + value: float64(count), + extraLabel: errorLabels, + extraLabelValue: errorLabelValues, + }) + } } } @@ -416,6 +430,7 @@ func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats) { name: "reserved_bytes", desc: "Amount of space reserved for a data type", + metricType: prometheus.GaugeValue, value: float64(s.ReservedBytes), extraLabel: []string{"block_group_type"}, extraLabelValue: []string{a}, @@ -436,6 +451,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr { name: "used_bytes", desc: "Amount of used space by a layout/data type", + metricType: prometheus.GaugeValue, value: float64(s.UsedBytes), extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{a, l}, @@ -443,6 +459,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr { name: "size_bytes", desc: "Amount of space allocated for a layout/data type", + metricType: prometheus.GaugeValue, value: float64(s.TotalBytes), extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{a, l}, @@ -450,6 +467,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr { name: "allocation_ratio", desc: "Data allocation ratio for a layout/data type", + metricType: prometheus.GaugeValue, value: s.Ratio, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{a, l}, From 62ed834481dfae5240b03be39188a0701cb7b490 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Sat, 28 May 2022 21:11:30 +0100 Subject: [PATCH 03/10] Move to using dennwc/btrfs for ioctl Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 198 +++++++++++---------------------------- go.mod | 1 + go.sum | 4 + 3 files changed, 59 insertions(+), 144 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index f269ad500e..6efa9a988b 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -17,13 +17,11 @@ package collector import ( - "bytes" "fmt" - "os" "strings" "syscall" - "unsafe" + dennwc "github.com/dennwc/btrfs" "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/procfs/btrfs" @@ -67,13 +65,21 @@ func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error { for _, s := range stats { // match up procfs and ioctl info by filesystem UUID - ioctlStats := ioctlStatsMap[s.UUID] + ioctlStats := ioctlStatsMap[strings.Replace(s.UUID, "-", "", -1)] c.updateBtrfsStats(ch, s, ioctlStats) } return nil } +var errorStatTypeNames = []string{ + "write", + "read", + "flush", + "corruption", + "generation", +} + type ioctlFsDeviceStats struct { path string uuid string @@ -81,7 +87,11 @@ type ioctlFsDeviceStats struct { bytesUsed uint64 totalBytes uint64 - errorStats map[string]uint64 + writeErrs uint64 + readErrs uint64 + flushErrs uint64 + corruptionErrs uint64 + generationErrs uint64 } type ioctlFsStats struct { @@ -89,32 +99,6 @@ type ioctlFsStats struct { devices []ioctlFsDeviceStats } -// Magic constants for ioctl -//nolint:revive -const ( - _BTRFS_IOC_FS_INFO = 0x8400941F - _BTRFS_IOC_DEV_INFO = 0xD000941E - _BTRFS_IOC_GET_DEV_STATS = 0x00c4089434 -) - -type _UuidBytes [16]byte - -func (id _UuidBytes) String() string { - return fmt.Sprintf("%x-%x-%x-%x-%x", id[0:4], id[4:6], id[6:8], id[8:10], id[10:]) -} - -//name matches linux struct -//nolint:revive -type btrfs_ioctl_fs_info_args struct { - maxID uint64 // out - numDevices uint64 // out - fsID _UuidBytes // out - nodeSize uint32 // out - sectorSize uint32 // out - cloneAlignment uint32 // out - _ [122*8 + 4]byte // pad to 1k -} - func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { // Instead of introducing more ioctl calls to scan for all btrfs // filesytems re-use our mount point utils to find known mounts @@ -138,32 +122,27 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { continue } - fd, err := os.Open(mount.mountPoint) + fs, err := dennwc.Open(mount.mountPoint, true) if err != nil { // failed to open this mount point, maybe we didn't have permission // maybe we'll find another mount point for this FS later continue } - var fsInfo = btrfs_ioctl_fs_info_args{} - _, _, errno := syscall.Syscall( - syscall.SYS_IOCTL, - fd.Fd(), - uintptr(_BTRFS_IOC_FS_INFO), - uintptr(unsafe.Pointer(&fsInfo))) - if errno != 0 { + fsInfo, err := fs.Info() + if err != nil { // Failed to get the FS info for some reason, // perhaps it'll work with a different mount point continue } - fsID := fsInfo.fsID.String() + fsID := fsInfo.FSID.String() if _, found := fsStats[fsID]; found { // We already found this filesystem by another mount point continue } - deviceStats, err := c.getIoctlDeviceStats(fd, &fsInfo) + deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo) if err != nil { return nil, err } @@ -178,114 +157,39 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { return fsStats, nil } -// Known/supported device stats fields -//nolint:revive -const ( - // direct indicators of I/O failures: - - _BTRFS_DEV_STAT_WRITE_ERRS = iota - _BTRFS_DEV_STAT_READ_ERRS - _BTRFS_DEV_STAT_FLUSH_ERRS +func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]ioctlFsDeviceStats, error) { + devices := make([]ioctlFsDeviceStats, 0, fsInfo.NumDevices) - // indirect indicators of I/O failures: - - _BTRFS_DEV_STAT_CORRUPTION_ERRS // checksum error, bytenr error or contents is illegal - _BTRFS_DEV_STAT_GENERATION_ERRS // an indication that blocks have not been written - - _BTRFS_DEV_STAT_VALUES_MAX // counter to indicate the number of known stats we support -) + for i := uint64(0); i <= fsInfo.MaxID; i++ { + deviceInfo, err := fs.GetDevInfo(i) -var errorStatFields = []int{ - _BTRFS_DEV_STAT_WRITE_ERRS, - _BTRFS_DEV_STAT_READ_ERRS, - _BTRFS_DEV_STAT_FLUSH_ERRS, - _BTRFS_DEV_STAT_CORRUPTION_ERRS, - _BTRFS_DEV_STAT_GENERATION_ERRS, -} -var errorStatNames = []string{ - "write", - "read", - "flush", - "corruption", - "generation", -} - -//name matches linux struct -//nolint:revive -type btrfs_ioctl_dev_info_args struct { - deviceID uint64 // in/out - uuid _UuidBytes // in/out - bytesUsed uint64 // out - totalBytes uint64 // out - _ [379]uint64 // pad to 4k - path [1024]byte // out -} - -//name matches linux struct -//nolint:revive -type btrfs_ioctl_get_dev_stats struct { - deviceID uint64 // in - itemCount uint64 // in/out - flags uint64 // in/out - values [_BTRFS_DEV_STAT_VALUES_MAX]uint64 // out values - _ [128 - 2 - _BTRFS_DEV_STAT_VALUES_MAX]uint64 // pad to 1k -} - -func (c *btrfsCollector) getIoctlDeviceStats(fd *os.File, fsInfo *btrfs_ioctl_fs_info_args) ([]ioctlFsDeviceStats, error) { - devices := make([]ioctlFsDeviceStats, 0, fsInfo.numDevices) - - var deviceInfo btrfs_ioctl_dev_info_args - var deviceStats btrfs_ioctl_get_dev_stats - - for i := uint64(0); i <= fsInfo.maxID; i++ { - deviceInfo = btrfs_ioctl_dev_info_args{ - deviceID: i, - } - deviceStats = btrfs_ioctl_get_dev_stats{ - deviceID: i, - itemCount: _BTRFS_DEV_STAT_VALUES_MAX, - } - - _, _, errno := syscall.Syscall( - syscall.SYS_IOCTL, - fd.Fd(), - uintptr(_BTRFS_IOC_DEV_INFO), - uintptr(unsafe.Pointer(&deviceInfo))) - - if errno == syscall.ENODEV { - // device IDs do not consistently start at 0, so we expect this - continue - } - if errno != 0 { - return nil, errno - } - - _, _, errno = syscall.Syscall( - syscall.SYS_IOCTL, - fd.Fd(), - uintptr(_BTRFS_IOC_GET_DEV_STATS), - uintptr(unsafe.Pointer(&deviceStats))) - - if errno != 0 { - return nil, errno + if err != nil { + if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENODEV { + // device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this + continue + } + return nil, err } - errorStats := make(map[string]uint64, deviceStats.itemCount) - for i, fieldIndex := range errorStatFields { - if int(deviceStats.itemCount) >= fieldIndex { - errorStats[errorStatNames[i]] = deviceStats.values[fieldIndex] - } + deviceStats, err := fs.GetDevStats(i) + if err != nil { + return nil, err } devices = append(devices, ioctlFsDeviceStats{ - path: string(bytes.Trim(deviceInfo.path[:], "\x00")), - uuid: deviceInfo.uuid.String(), - bytesUsed: deviceInfo.bytesUsed, - totalBytes: deviceInfo.totalBytes, - errorStats: errorStats, + path: deviceInfo.Path, + uuid: deviceInfo.UUID.String(), + bytesUsed: deviceInfo.BytesUsed, + totalBytes: deviceInfo.TotalBytes, + + writeErrs: deviceStats.WriteErrs, + readErrs: deviceStats.ReadErrs, + flushErrs: deviceStats.FlushErrs, + corruptionErrs: deviceStats.CorruptionErrs, + generationErrs: deviceStats.GenerationErrs, }) - if uint64(len(devices)) == fsInfo.numDevices { + if uint64(len(devices)) == fsInfo.NumDevices { break } } @@ -401,16 +305,22 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt }) errorLabels := append([]string{"type"}, extraLabels...) - for errorName, count := range dev.errorStats { - errorLabelValues := append([]string{errorName}, extraLabelValues...) + values := []uint64{ + dev.writeErrs, + dev.readErrs, + dev.flushErrs, + dev.corruptionErrs, + dev.generationErrs, + } + for i, errorType := range errorStatTypeNames { metrics = append(metrics, btrfsMetric{ name: "device_errors_total", desc: "Errors reported for the device", metricType: prometheus.CounterValue, - value: float64(count), + value: float64(values[i]), extraLabel: errorLabels, - extraLabelValue: errorLabelValues, + extraLabelValue: append([]string{errorType}, extraLabelValues...), }) } } diff --git a/go.mod b/go.mod index 6b44eea957..993c4c1f0b 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/prometheus/node_exporter require ( github.com/beevik/ntp v0.3.0 github.com/coreos/go-systemd/v22 v22.3.2 + github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00 github.com/go-kit/log v0.2.0 github.com/godbus/dbus/v5 v5.1.0 diff --git a/go.sum b/go.sum index be0a3f889c..3d9c76a550 100644 --- a/go.sum +++ b/go.sum @@ -63,6 +63,10 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac h1:cjfIEKdg/lZ+ewJk8FYoYZTN2HY/okVSPtmHLaYMAvE= +github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac/go.mod h1:MYsOV9Dgsec3FFSOjywi0QK5r6TeBbdWxdrMGtiYXHA= +github.com/dennwc/ioctl v1.0.0 h1:DsWAAjIxRqNcLn9x6mwfuf2pet3iB7aK90K4tF16rLg= +github.com/dennwc/ioctl v1.0.0/go.mod h1:ellh2YB5ldny99SBU/VX7Nq0xiZbHphf1DrtHxxjMk0= github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00 h1:0GHzegkDz/zSrt+Zph1OueNImPdUxoToypnkhhRYTjI= github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00/go.mod h1:ix4kG2zvdUd8kEKSW0ZTr1XLks0epFpI4j745DXxlNE= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= From ce492ebfef845502f22791c3e1600d8360dac9c0 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Sat, 28 May 2022 22:07:37 +0100 Subject: [PATCH 04/10] Add debug logging Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index 6efa9a988b..3e0d4335e9 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -23,6 +23,7 @@ import ( dennwc "github.com/dennwc/btrfs" "github.com/go-kit/log" + "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/procfs/btrfs" ) @@ -126,6 +127,10 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { if err != nil { // failed to open this mount point, maybe we didn't have permission // maybe we'll find another mount point for this FS later + level.Debug(c.logger).Log( + "msg", "Error inspecting btrfs mountpoint", + "mountPoint", mount.mountPoint, + "err", err) continue } @@ -133,6 +138,10 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { if err != nil { // Failed to get the FS info for some reason, // perhaps it'll work with a different mount point + level.Debug(c.logger).Log( + "msg", "Error querying btrfs filesystem", + "mountPoint", mount.mountPoint, + "err", err) continue } @@ -144,6 +153,10 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo) if err != nil { + level.Debug(c.logger).Log( + "msg", "Error querying btrfs device stats", + "mountPoint", mount.mountPoint, + "err", err) return nil, err } From dd86d34d418f9a6331bea002297596a2deec4cb8 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Fri, 24 Jun 2022 22:11:46 +0100 Subject: [PATCH 05/10] Move list inline and improve variable name Also add some breadcrumbs explaining where the error stats come from. Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index 3e0d4335e9..1c54669586 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -73,14 +73,6 @@ func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error { return nil } -var errorStatTypeNames = []string{ - "write", - "read", - "flush", - "corruption", - "generation", -} - type ioctlFsDeviceStats struct { path string uuid string @@ -88,6 +80,9 @@ type ioctlFsDeviceStats struct { bytesUsed uint64 totalBytes uint64 + // The error stats below match the following upstream lists: + // https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140 + // https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692 writeErrs uint64 readErrs uint64 flushErrs uint64 @@ -325,7 +320,15 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt dev.corruptionErrs, dev.generationErrs, } - for i, errorType := range errorStatTypeNames { + btrfsErrorTypeNames := []string{ + "write", + "read", + "flush", + "corruption", + "generation", + } + + for i, errorType := range btrfsErrorTypeNames { metrics = append(metrics, btrfsMetric{ name: "device_errors_total", From 6f681263e61e791d724f2a7ca52d1149a3e7639a Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Fri, 24 Jun 2022 22:14:29 +0100 Subject: [PATCH 06/10] Reduce control flow complexity Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 117 +++++++++++++++++----------------- collector/btrfs_linux_test.go | 12 ++-- 2 files changed, 65 insertions(+), 64 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index 1c54669586..939f6266a5 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -269,6 +269,11 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt }, } + // Information about data, metadata and system data. + metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...) + metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...) + metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...) + // Information about devices. if iocStats == nil { for n, dev := range s.Devices { @@ -281,72 +286,68 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt extraLabelValue: []string{n}, }) } - } else { - for _, dev := range iocStats.devices { - // trim the /dev/ prefix from the device name so the value should match - // the value used in the fallback branch above - device := strings.TrimPrefix(dev.path, "/dev/") + return metrics + } + + for _, dev := range iocStats.devices { + // trim the /dev/ prefix from the device name so the value should match + // the value used in the fallback branch above + device := strings.TrimPrefix(dev.path, "/dev/") - extraLabels := []string{"device", "device_uuid"} - extraLabelValues := []string{device, dev.uuid} + extraLabels := []string{"device", "device_uuid"} + extraLabelValues := []string{device, dev.uuid} + metrics = append(metrics, + btrfsMetric{ + name: "device_size_bytes", + desc: "Size of a device that is part of the filesystem.", + metricType: prometheus.GaugeValue, + value: float64(dev.totalBytes), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + // A bytes available metric is probably more useful than a + // bytes used metric, because large numbers of bytes will + // suffer from floating point representation issues + // and we probably care more about the number when it's low anyway + btrfsMetric{ + name: "device_unused_bytes", + desc: "Unused bytes unused on a device that is part of the filesystem.", + metricType: prometheus.GaugeValue, + value: float64(dev.totalBytes - dev.bytesUsed), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }) + + errorLabels := append([]string{"type"}, extraLabels...) + values := []uint64{ + dev.writeErrs, + dev.readErrs, + dev.flushErrs, + dev.corruptionErrs, + dev.generationErrs, + } + btrfsErrorTypeNames := []string{ + "write", + "read", + "flush", + "corruption", + "generation", + } + + for i, errorType := range btrfsErrorTypeNames { metrics = append(metrics, btrfsMetric{ - name: "device_size_bytes", - desc: "Size of a device that is part of the filesystem.", - metricType: prometheus.GaugeValue, - value: float64(dev.totalBytes), - extraLabel: extraLabels, - extraLabelValue: extraLabelValues, - }, - // A bytes available metric is probably more useful than a - // bytes used metric, because large numbers of bytes will - // suffer from floating point representation issues - // and we probably care more about the number when it's low anyway - btrfsMetric{ - name: "device_unused_bytes", - desc: "Unused bytes unused on a device that is part of the filesystem.", - metricType: prometheus.GaugeValue, - value: float64(dev.totalBytes - dev.bytesUsed), - extraLabel: extraLabels, - extraLabelValue: extraLabelValues, + name: "device_errors_total", + desc: "Errors reported for the device", + metricType: prometheus.CounterValue, + value: float64(values[i]), + extraLabel: errorLabels, + extraLabelValue: append([]string{errorType}, extraLabelValues...), }) - - errorLabels := append([]string{"type"}, extraLabels...) - values := []uint64{ - dev.writeErrs, - dev.readErrs, - dev.flushErrs, - dev.corruptionErrs, - dev.generationErrs, - } - btrfsErrorTypeNames := []string{ - "write", - "read", - "flush", - "corruption", - "generation", - } - - for i, errorType := range btrfsErrorTypeNames { - metrics = append(metrics, - btrfsMetric{ - name: "device_errors_total", - desc: "Errors reported for the device", - metricType: prometheus.CounterValue, - value: float64(values[i]), - extraLabel: errorLabels, - extraLabelValue: append([]string{errorType}, extraLabelValues...), - }) - } } } - // Information about data, metadata and system data. - metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...) - metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...) - metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...) - return metrics } diff --git a/collector/btrfs_linux_test.go b/collector/btrfs_linux_test.go index eef90f64e9..fcd6f9f995 100644 --- a/collector/btrfs_linux_test.go +++ b/collector/btrfs_linux_test.go @@ -27,8 +27,6 @@ var expectedBtrfsMetrics = [][]btrfsMetric{ { {name: "info", value: 1, extraLabel: []string{"label"}, extraLabelValue: []string{"fixture"}}, {name: "global_rsv_size_bytes", value: 1.6777216e+07}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop26"}}, {name: "reserved_bytes", value: 0, extraLabel: []string{"block_group_type"}, extraLabelValue: []string{"data"}}, {name: "used_bytes", value: 8.08189952e+08, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid0"}}, {name: "size_bytes", value: 2.147483648e+09, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid0"}}, @@ -41,14 +39,12 @@ var expectedBtrfsMetrics = [][]btrfsMetric{ {name: "used_bytes", value: 16384, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}}, {name: "size_bytes", value: 8.388608e+06, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}}, {name: "allocation_ratio", value: 2, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop26"}}, }, { {name: "info", value: 1, extraLabel: []string{"label"}, extraLabelValue: []string{""}}, {name: "global_rsv_size_bytes", value: 1.6777216e+07}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop22"}}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop23"}}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop24"}}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}}, {name: "reserved_bytes", value: 0, extraLabel: []string{"block_group_type"}, extraLabelValue: []string{"data"}}, {name: "used_bytes", value: 0, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid5"}}, {name: "size_bytes", value: 6.44087808e+08, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid5"}}, @@ -61,6 +57,10 @@ var expectedBtrfsMetrics = [][]btrfsMetric{ {name: "used_bytes", value: 16384, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}}, {name: "size_bytes", value: 1.6777216e+07, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}}, {name: "allocation_ratio", value: 2, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop22"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop23"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop24"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}}, }, } From 383657ad8ae1f0d038f8ad07d787c39e20a6c125 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Fri, 24 Jun 2022 22:18:03 +0100 Subject: [PATCH 07/10] Don't panic if we failed to find stats for some devices Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index 939f6266a5..97def00276 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -61,12 +61,16 @@ func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error { ioctlStatsMap, err := c.getIoctlStats() if err != nil { - return fmt.Errorf("failed to retrieve Btrfs stats with ioctl: %w", err) + level.Debug(c.logger).Log( + "msg", "Error querying btrfs device stats with ioctl", + "err", err) + ioctlStatsMap = make(map[string]*btrfsIoctlFsStats) } for _, s := range stats { - // match up procfs and ioctl info by filesystem UUID - ioctlStats := ioctlStatsMap[strings.Replace(s.UUID, "-", "", -1)] + // match up procfs and ioctl info by filesystem UUID (without dashes) + var fsUUID = strings.Replace(s.UUID, "-", "", -1) + ioctlStats := ioctlStatsMap[fsUUID] c.updateBtrfsStats(ch, s, ioctlStats) } @@ -152,7 +156,7 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { "msg", "Error querying btrfs device stats", "mountPoint", mount.mountPoint, "err", err) - return nil, err + continue } devicesDone[mount.device] = struct{}{} From 3c5ba8521159ad2175037b0024a38aa588b5292f Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Fri, 24 Jun 2022 22:21:50 +0100 Subject: [PATCH 08/10] Improve the names of things Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index 97def00276..7711d0ab5b 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -77,7 +77,7 @@ func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error { return nil } -type ioctlFsDeviceStats struct { +type btrfsIoctlFsDevStats struct { path string uuid string @@ -94,12 +94,12 @@ type ioctlFsDeviceStats struct { generationErrs uint64 } -type ioctlFsStats struct { +type btrfsIoctlFsStats struct { uuid string - devices []ioctlFsDeviceStats + devices []btrfsIoctlFsDevStats } -func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { +func (c *btrfsCollector) getIoctlStats() (map[string]*btrfsIoctlFsStats, error) { // Instead of introducing more ioctl calls to scan for all btrfs // filesytems re-use our mount point utils to find known mounts mountsList, err := mountPointDetails(c.logger) @@ -110,7 +110,7 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { // track devices we have successfully scanned, by device path devicesDone := make(map[string]struct{}) // filesystems scann results by UUID - fsStats := make(map[string]*ioctlFsStats) + fsStats := make(map[string]*btrfsIoctlFsStats) for _, mount := range mountsList { if mount.fsType != "btrfs" { @@ -160,7 +160,7 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { } devicesDone[mount.device] = struct{}{} - fsStats[fsID] = &ioctlFsStats{ + fsStats[fsID] = &btrfsIoctlFsStats{ uuid: fsID, devices: deviceStats, } @@ -169,8 +169,8 @@ func (c *btrfsCollector) getIoctlStats() (map[string]*ioctlFsStats, error) { return fsStats, nil } -func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]ioctlFsDeviceStats, error) { - devices := make([]ioctlFsDeviceStats, 0, fsInfo.NumDevices) +func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]btrfsIoctlFsDevStats, error) { + devices := make([]btrfsIoctlFsDevStats, 0, fsInfo.NumDevices) for i := uint64(0); i <= fsInfo.MaxID; i++ { deviceInfo, err := fs.GetDevInfo(i) @@ -188,7 +188,7 @@ func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) return nil, err } - devices = append(devices, ioctlFsDeviceStats{ + devices = append(devices, btrfsIoctlFsDevStats{ path: deviceInfo.Path, uuid: deviceInfo.UUID.String(), bytesUsed: deviceInfo.BytesUsed, @@ -220,14 +220,14 @@ type btrfsMetric struct { } // updateBtrfsStats collects statistics for one bcache ID. -func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, iocStats *ioctlFsStats) { +func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) { const subsystem = "btrfs" // Basic information about the filesystem. devLabels := []string{"uuid"} // Retrieve the metrics. - metrics := c.getMetrics(s, iocStats) + metrics := c.getMetrics(s, ioctlStats) // Convert all gathered metrics to Prometheus Metrics and add to channel. for _, m := range metrics { @@ -255,7 +255,7 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs. } // getMetrics returns metrics for the given Btrfs statistics. -func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []btrfsMetric { +func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) []btrfsMetric { metrics := []btrfsMetric{ { name: "info", @@ -279,7 +279,7 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...) // Information about devices. - if iocStats == nil { + if ioctlStats == nil { for n, dev := range s.Devices { metrics = append(metrics, btrfsMetric{ name: "device_size_bytes", @@ -293,7 +293,7 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, iocStats *ioctlFsStats) []bt return metrics } - for _, dev := range iocStats.devices { + for _, dev := range ioctlStats.devices { // trim the /dev/ prefix from the device name so the value should match // the value used in the fallback branch above device := strings.TrimPrefix(dev.path, "/dev/") From d3cca54f2bd10eb400cccd7d7b1f79047a5d81e2 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Fri, 24 Jun 2022 22:26:00 +0100 Subject: [PATCH 09/10] Improve getting device name from path Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index 7711d0ab5b..c90d96fd89 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -18,6 +18,7 @@ package collector import ( "fmt" + "path" "strings" "syscall" @@ -294,9 +295,10 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStat } for _, dev := range ioctlStats.devices { - // trim the /dev/ prefix from the device name so the value should match - // the value used in the fallback branch above - device := strings.TrimPrefix(dev.path, "/dev/") + // trim the path prefix from the device name so the value should match + // the value used in the fallback branch above. + // e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1 + _, device := path.Split(dev.path) extraLabels := []string{"device", "device_uuid"} extraLabelValues := []string{device, dev.uuid} From 8b7335d93e720a8b7dbbfd7adbe7bdf62e55a7c0 Mon Sep 17 00:00:00 2001 From: Marcus Cobden Date: Mon, 25 Jul 2022 20:42:07 +0100 Subject: [PATCH 10/10] Rename device_uuid label to btrfs_dev_uuid It does not appear to be a device UUID, so attempt to be less misleading but keep the label to preserve separation between disks. Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index c90d96fd89..bb1820ffc1 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -300,7 +300,7 @@ func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStat // e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1 _, device := path.Split(dev.path) - extraLabels := []string{"device", "device_uuid"} + extraLabels := []string{"device", "btrfs_dev_uuid"} extraLabelValues := []string{device, dev.uuid} metrics = append(metrics,