From 688c3c4f44a53d791ecd4f790856f2d862c8388d Mon Sep 17 00:00:00 2001 From: HarpPDX Date: Fri, 6 Feb 2026 18:47:21 -0800 Subject: [PATCH 1/7] feature: linux kernel telemetry --- internal/script/scripts.go | 82 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/internal/script/scripts.go b/internal/script/scripts.go index af856e87..c2ee17e9 100644 --- a/internal/script/scripts.go +++ b/internal/script/scripts.go @@ -1441,6 +1441,88 @@ for ((i=0; i Date: Sat, 7 Feb 2026 08:36:14 -0800 Subject: [PATCH 2/7] tables and renderers --- cmd/telemetry/telemetry.go | 52 ++++++++------ cmd/telemetry/telemetry_renderers.go | 70 ++++++++++++++++++ cmd/telemetry/telemetry_tables.go | 104 +++++++++++++++++++++++++++ internal/script/scripts.go | 3 +- 4 files changed, 206 insertions(+), 23 deletions(-) diff --git a/cmd/telemetry/telemetry.go b/cmd/telemetry/telemetry.go index 5c8864b8..3ac107f4 100644 --- a/cmd/telemetry/telemetry.go +++ b/cmd/telemetry/telemetry.go @@ -52,17 +52,19 @@ var ( flagAll bool - flagCPU bool - flagFrequency bool - flagIPC bool - flagC6 bool - flagIRQRate bool - flagMemory bool - flagNetwork bool - flagStorage bool - flagPower bool - flagTemperature bool - flagInstrMix bool + flagCPU bool + flagFrequency bool + flagIPC bool + flagC6 bool + flagIRQRate bool + flagMemory bool + flagNetwork bool + flagStorage bool + flagPower bool + flagTemperature bool + flagInstrMix bool + flagVirtualMemory bool + flagProcess bool flagNoSystemSummary bool @@ -76,17 +78,19 @@ const ( flagAllName = "all" - flagCPUName = "cpu" - flagFrequencyName = "frequency" - flagIPCName = "ipc" - flagC6Name = "c6" - flagIRQRateName = "irqrate" - flagMemoryName = "memory" - flagNetworkName = "network" - flagStorageName = "storage" - flagPowerName = "power" - flagTemperatureName = "temperature" - flagInstrMixName = "instrmix" + flagCPUName = "cpu" + flagFrequencyName = "frequency" + flagIPCName = "ipc" + flagC6Name = "c6" + flagIRQRateName = "irqrate" + flagMemoryName = "memory" + flagNetworkName = "network" + flagStorageName = "storage" + flagPowerName = "power" + flagTemperatureName = "temperature" + flagInstrMixName = "instrmix" + flagVirtualMemoryName = "virtual-memory" + flagProcessName = "process" flagNoSystemSummaryName = "no-summary" @@ -108,6 +112,8 @@ var categories = []app.Category{ {FlagName: flagStorageName, FlagVar: &flagStorage, DefaultValue: false, Help: "monitor storage", Tables: []table.TableDefinition{tableDefinitions[DriveTelemetryTableName]}}, {FlagName: flagIRQRateName, FlagVar: &flagIRQRate, DefaultValue: false, Help: "monitor IRQ rate", Tables: []table.TableDefinition{tableDefinitions[IRQRateTelemetryTableName]}}, {FlagName: flagInstrMixName, FlagVar: &flagInstrMix, DefaultValue: false, Help: "monitor instruction mix", Tables: []table.TableDefinition{tableDefinitions[InstructionTelemetryTableName]}}, + {FlagName: flagVirtualMemoryName, FlagVar: &flagVirtualMemory, DefaultValue: false, Help: "monitor virtual memory", Tables: []table.TableDefinition{tableDefinitions[VirtualMemoryTelemetryTableName]}}, + {FlagName: flagProcessName, FlagVar: &flagProcess, DefaultValue: false, Help: "monitor process telemetry", Tables: []table.TableDefinition{tableDefinitions[ProcessTelemetryTableName]}}, } const ( @@ -338,6 +344,8 @@ func runCmd(cmd *cobra.Command, args []string) error { report.RegisterHTMLRenderer(InstructionTelemetryTableName, instructionTelemetryTableHTMLRenderer) report.RegisterHTMLRenderer(GaudiTelemetryTableName, gaudiTelemetryTableHTMLRenderer) report.RegisterHTMLRenderer(PDUTelemetryTableName, pduTelemetryTableHTMLRenderer) + report.RegisterHTMLRenderer(VirtualMemoryTelemetryTableName, virtualMemoryTelemetryTableHTMLRenderer) + report.RegisterHTMLRenderer(ProcessTelemetryTableName, processTelemetryTableHTMLRenderer) return reportingCommand.Run() } diff --git a/cmd/telemetry/telemetry_renderers.go b/cmd/telemetry/telemetry_renderers.go index f88bd392..c1695d42 100644 --- a/cmd/telemetry/telemetry_renderers.go +++ b/cmd/telemetry/telemetry_renderers.go @@ -673,3 +673,73 @@ func pduTelemetryTableHTMLRenderer(tableValues table.TableValues, targetName str } return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig, nil) } + +func virtualMemoryTelemetryTableHTMLRenderer(tableValues table.TableValues, targetName string) string { + data := [][]float64{} + datasetNames := []string{} + for _, field := range tableValues.Fields[1:] { + points := []float64{} + for _, val := range field.Values { + if val == "" { + break + } + stat, err := strconv.ParseFloat(val, 64) + if err != nil { + slog.Error("error parsing stat", slog.String("error", err.Error())) + return "" + } + points = append(points, stat) + } + if len(points) > 0 { + data = append(data, points) + datasetNames = append(datasetNames, field.Name) + } + } + chartConfig := report.ChartTemplateStruct{ + ID: fmt.Sprintf("%s%d", tableValues.Name, util.RandUint(10000)), + XaxisText: "Time", + YaxisText: "count per second", + TitleText: "", + DisplayTitle: "false", + DisplayLegend: "true", + AspectRatio: "2", + SuggestedMin: "0", + SuggestedMax: "0", + } + return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig, nil) +} + +func processTelemetryTableHTMLRenderer(tableValues table.TableValues, targetName string) string { + data := [][]float64{} + datasetNames := []string{} + for _, field := range tableValues.Fields[1:] { + points := []float64{} + for _, val := range field.Values { + if val == "" { + break + } + stat, err := strconv.ParseFloat(val, 64) + if err != nil { + slog.Error("error parsing stat", slog.String("error", err.Error())) + return "" + } + points = append(points, stat) + } + if len(points) > 0 { + data = append(data, points) + datasetNames = append(datasetNames, field.Name) + } + } + chartConfig := report.ChartTemplateStruct{ + ID: fmt.Sprintf("%s%d", tableValues.Name, util.RandUint(10000)), + XaxisText: "Time", + YaxisText: "count per second", + TitleText: "", + DisplayTitle: "false", + DisplayLegend: "true", + AspectRatio: "2", + SuggestedMin: "0", + SuggestedMax: "0", + } + return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig, nil) +} diff --git a/cmd/telemetry/telemetry_tables.go b/cmd/telemetry/telemetry_tables.go index ec9209b7..fe245ab8 100644 --- a/cmd/telemetry/telemetry_tables.go +++ b/cmd/telemetry/telemetry_tables.go @@ -33,6 +33,8 @@ const ( TemperatureTelemetryTableName = "Temperature Telemetry" GaudiTelemetryTableName = "Gaudi Telemetry" PDUTelemetryTableName = "PDU Telemetry" + VirtualMemoryTelemetryTableName = "Virtual Memory Telemetry" + ProcessTelemetryTableName = "Process Telemetry" ) // telemetry table menu labels @@ -51,6 +53,8 @@ const ( TemperatureTelemetryMenuLabel = "Temperature" GaudiTelemetryMenuLabel = "Gaudi" PDUTelemetryMenuLabel = "PDU" + VirtualMemoryTelemetryMenuLabel = "Virtual Memory" + ProcessTelemetryMenuLabel = "Process" ) var tableDefinitions = map[string]table.TableDefinition{ @@ -177,6 +181,22 @@ var tableDefinitions = map[string]table.TableDefinition{ script.PDUTelemetryScriptName, }, FieldsFunc: pduTelemetryTableValues}, + VirtualMemoryTelemetryTableName: { + Name: VirtualMemoryTelemetryTableName, + MenuLabel: VirtualMemoryTelemetryMenuLabel, + HasRows: true, + ScriptNames: []string{ + script.KernelTelemetryScriptName, + }, + FieldsFunc: virtualMemoryTelemetryTableValues}, + ProcessTelemetryTableName: { + Name: ProcessTelemetryTableName, + MenuLabel: ProcessTelemetryMenuLabel, + HasRows: true, + ScriptNames: []string{ + script.KernelTelemetryScriptName, + }, + FieldsFunc: processTelemetryTableValues}, } func cpuUtilizationTelemetryTableValues(outputs map[string]script.ScriptOutput) []table.Field { @@ -707,3 +727,87 @@ func instructionTelemetryTableValues(outputs map[string]script.ScriptOutput) []t } return fields } + +func virtualMemoryTelemetryTableValues(outputs map[string]script.ScriptOutput) []table.Field { + fields := []table.Field{ + {Name: "Time"}, + {Name: "Minor Faults/s"}, + {Name: "Major Faults/s"}, + {Name: "Pgscan/s"}, + {Name: "Pgsteal/s"}, + {Name: "Swapin/s"}, + {Name: "Swapout/s"}, + } + // the output is in CSV format: + // timestamp,ctx_switches_per_sec,procs_running,procs_blocked,minor_faults_per_sec,major_faults_per_sec,pgscan_per_sec,pgsteal_per_sec,swapin_per_sec,swapout_per_sec + reader := csv.NewReader(strings.NewReader(outputs[script.KernelTelemetryScriptName].Stdout)) + records, err := reader.ReadAll() + if err != nil { + slog.Error("failed to read virtual memory telemetry CSV output", slog.String("error", err.Error())) + return []table.Field{} + } + if len(records) == 0 { + return []table.Field{} + } + // first row is the header, find the indices of the fields we're interested in + header := records[0] + fieldIndices := make(map[string]int) + for i, fieldName := range header { + fieldIndices[fieldName] = i + } + requiredFields := []string{"timestamp", "minor_faults_per_sec", "major_faults_per_sec", "pgscan_per_sec", "pgsteal_per_sec", "swapin_per_sec", "swapout_per_sec"} + for _, field := range requiredFields { + if _, ok := fieldIndices[field]; !ok { + slog.Error("missing expected field in virtual memory telemetry output", slog.String("field", field)) + return []table.Field{} + } + } + // subsequent rows are data + for _, record := range records[1:] { + fields[0].Values = append(fields[0].Values, record[fieldIndices["timestamp"]]) + fields[1].Values = append(fields[1].Values, record[fieldIndices["minor_faults_per_sec"]]) + fields[2].Values = append(fields[2].Values, record[fieldIndices["major_faults_per_sec"]]) + fields[3].Values = append(fields[3].Values, record[fieldIndices["pgscan_per_sec"]]) + fields[4].Values = append(fields[4].Values, record[fieldIndices["pgsteal_per_sec"]]) + fields[5].Values = append(fields[5].Values, record[fieldIndices["swapin_per_sec"]]) + fields[6].Values = append(fields[6].Values, record[fieldIndices["swapout_per_sec"]]) + } + return fields +} + +func processTelemetryTableValues(outputs map[string]script.ScriptOutput) []table.Field { + fields := []table.Field{ + {Name: "Time"}, + {Name: "Context Switches/s"}, + } + // the output is in CSV format: + // timestamp,ctx_switches_per_sec,procs_running,procs_blocked,minor_faults_per_sec,major_faults_per_sec,pgscan_per_sec,pgsteal_per_sec,swapin_per_sec,swapout_per_sec + reader := csv.NewReader(strings.NewReader(outputs[script.KernelTelemetryScriptName].Stdout)) + records, err := reader.ReadAll() + if err != nil { + slog.Error("failed to read process telemetry CSV output", slog.String("error", err.Error())) + return []table.Field{} + } + if len(records) == 0 { + return []table.Field{} + } + // first row is the header, find the indices of the fields we're interested in + header := records[0] + fieldIndices := make(map[string]int) + for i, fieldName := range header { + fieldIndices[fieldName] = i + } + requiredFields := []string{"timestamp", "ctx_switches_per_sec"} + for _, field := range requiredFields { + if _, ok := fieldIndices[field]; !ok { + slog.Error("missing expected field in process telemetry output", slog.String("field", field)) + return []table.Field{} + } + } + // subsequent rows are data + for _, record := range records[1:] { + fields[0].Values = append(fields[0].Values, record[fieldIndices["timestamp"]]) + fields[1].Values = append(fields[1].Values, record[fieldIndices["ctx_switches_per_sec"]]) + } + return fields +} diff --git a/internal/script/scripts.go b/internal/script/scripts.go index c2ee17e9..42df27cd 100644 --- a/internal/script/scripts.go +++ b/internal/script/scripts.go @@ -119,6 +119,7 @@ const ( InstructionTelemetryScriptName = "instruction telemetry" GaudiTelemetryScriptName = "gaudi telemetry" PDUTelemetryScriptName = "pdu telemetry" + KernelTelemetryScriptName = "kernel telemetry" // flamegraph scripts FlameGraphScriptName = "flamegraph" // lock scripts @@ -1444,7 +1445,7 @@ done `, Superuser: false, }, - { + KernelTelemetryScriptName: { Name: KernelTelemetryScriptName, ScriptTemplate: `interval={{.Interval}} duration={{.Duration}} From b764ade52079fccb1a5f7e9f0f589dee50bf95f5 Mon Sep 17 00:00:00 2001 From: "Harper, Jason M" Date: Mon, 9 Feb 2026 13:50:10 -0800 Subject: [PATCH 3/7] fix: kernel telemetry sample count off-by-one Move the duration exit check to after output but before sleep, so the final sample is emitted before exiting. This ensures the sample count matches other telemetry scripts (e.g., duration=10 with interval=2 now produces 5 samples instead of 4). Co-Authored-By: Claude Opus 4.5 --- internal/script/scripts.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/script/scripts.go b/internal/script/scripts.go index 42df27cd..67db97ce 100644 --- a/internal/script/scripts.go +++ b/internal/script/scripts.go @@ -1477,8 +1477,6 @@ read_vm() { while true; do NOW=$(date +%s) - [[ "$duration" -ne 0 && $((NOW - START_TIME)) -ge "$duration" ]] && exit 0 - STAT_NOW="$(read_stat)" VM_NOW="$(read_vm)" @@ -1522,6 +1520,8 @@ $swapin,$swapout" PREV_STAT="$STAT_NOW" PREV_VM="$VM_NOW" + [[ "$duration" -ne 0 && $((NOW - START_TIME)) -ge "$duration" ]] && exit 0 + sleep "$interval" done `, From 4b541b62ceee2c946681b6ab728d85b802976951 Mon Sep 17 00:00:00 2001 From: "Harper, Jason M" Date: Mon, 9 Feb 2026 13:51:59 -0800 Subject: [PATCH 4/7] fix: use hh:mm:ss timestamp format in kernel telemetry Change kernel telemetry output to use human-readable timestamp format (hh:mm:ss) consistent with other telemetry tables, while keeping Unix timestamp for duration calculations. Co-Authored-By: Claude Opus 4.5 --- internal/script/scripts.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/script/scripts.go b/internal/script/scripts.go index 67db97ce..31a32024 100644 --- a/internal/script/scripts.go +++ b/internal/script/scripts.go @@ -1476,6 +1476,7 @@ read_vm() { while true; do NOW=$(date +%s) + TIMESTAMP=$(date +%H:%M:%S) STAT_NOW="$(read_stat)" VM_NOW="$(read_vm)" @@ -1512,7 +1513,7 @@ while true; do swapin=$(get_vm_delta pswpin) swapout=$(get_vm_delta pswpout) - echo "$NOW,$ctx_rate,$pr_run,$pr_blk,\ + echo "$TIMESTAMP,$ctx_rate,$pr_run,$pr_blk,\ $minflt,$majflt,$pgscan,$pgsteal,\ $swapin,$swapout" fi From 722341f4f545c9aa6227a5d61288c603ac2fd844 Mon Sep 17 00:00:00 2001 From: "Harper, Jason M" Date: Mon, 9 Feb 2026 14:09:27 -0800 Subject: [PATCH 5/7] feature: add kernel metrics to telemetry summary Add minor page faults/s, major page faults/s, and context switches/s to the telemetry summary table. Co-Authored-By: Claude Opus 4.5 --- cmd/telemetry/telemetry.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmd/telemetry/telemetry.go b/cmd/telemetry/telemetry.go index 3ac107f4..f2736647 100644 --- a/cmd/telemetry/telemetry.go +++ b/cmd/telemetry/telemetry.go @@ -371,6 +371,9 @@ func summaryFromTableValues(allTableValues []table.TableValues, _ map[string]scr networkReads := getMetricAverage(getTableValues(allTableValues, NetworkTelemetryTableName), []string{"rxkB/s"}, "Time") networkWrites := getMetricAverage(getTableValues(allTableValues, NetworkTelemetryTableName), []string{"txkB/s"}, "Time") memAvail := getMetricAverage(getTableValues(allTableValues, MemoryTelemetryTableName), []string{"avail"}, "Time") + minorFaults := getMetricAverage(getTableValues(allTableValues, VirtualMemoryTelemetryTableName), []string{"Minor Faults/s"}, "Time") + majorFaults := getMetricAverage(getTableValues(allTableValues, VirtualMemoryTelemetryTableName), []string{"Major Faults/s"}, "Time") + ctxSwitches := getMetricAverage(getTableValues(allTableValues, ProcessTelemetryTableName), []string{"Context Switches/s"}, "Time") return table.TableValues{ TableDefinition: table.TableDefinition{ Name: telemetrySummaryTableName, @@ -389,6 +392,9 @@ func summaryFromTableValues(allTableValues []table.TableValues, _ map[string]scr {Name: "Drive Writes (kB/s)", Values: []string{driveWrites}}, {Name: "Network RX (kB/s)", Values: []string{networkReads}}, {Name: "Network TX (kB/s)", Values: []string{networkWrites}}, + {Name: "Minor Page Faults/s", Values: []string{minorFaults}}, + {Name: "Major Page Faults/s", Values: []string{majorFaults}}, + {Name: "Context Switches/s", Values: []string{ctxSwitches}}, }, } } From c50a53b7b481ab4708bdb68023fbacec67464cdd Mon Sep 17 00:00:00 2001 From: "Harper, Jason M" Date: Mon, 9 Feb 2026 14:36:27 -0800 Subject: [PATCH 6/7] make sure number of fields in record is same as number of fields in header Signed-off-by: Harper, Jason M --- cmd/telemetry/telemetry_tables.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmd/telemetry/telemetry_tables.go b/cmd/telemetry/telemetry_tables.go index fe245ab8..d1e43cac 100644 --- a/cmd/telemetry/telemetry_tables.go +++ b/cmd/telemetry/telemetry_tables.go @@ -764,6 +764,10 @@ func virtualMemoryTelemetryTableValues(outputs map[string]script.ScriptOutput) [ } // subsequent rows are data for _, record := range records[1:] { + if len(record) != len(header) { + slog.Error("unexpected number of fields in virtual memory telemetry output", slog.Int("expected", len(header)), slog.Int("got", len(record))) + continue + } fields[0].Values = append(fields[0].Values, record[fieldIndices["timestamp"]]) fields[1].Values = append(fields[1].Values, record[fieldIndices["minor_faults_per_sec"]]) fields[2].Values = append(fields[2].Values, record[fieldIndices["major_faults_per_sec"]]) @@ -806,6 +810,10 @@ func processTelemetryTableValues(outputs map[string]script.ScriptOutput) []table } // subsequent rows are data for _, record := range records[1:] { + if len(record) != len(header) { + slog.Error("unexpected number of fields in process telemetry output", slog.Int("expected", len(header)), slog.Int("got", len(record))) + continue + } fields[0].Values = append(fields[0].Values, record[fieldIndices["timestamp"]]) fields[1].Values = append(fields[1].Values, record[fieldIndices["ctx_switches_per_sec"]]) } From 5cddbfe859b5a441b0673481136759467228d080 Mon Sep 17 00:00:00 2001 From: "Harper, Jason M" Date: Mon, 9 Feb 2026 14:56:06 -0800 Subject: [PATCH 7/7] fix: increase signal handler timeout to 20 seconds The 10 second timeout was insufficient for graceful shutdown when kernel telemetry scripts need time to complete their cleanup handlers. This caused intermittent exit code 255 errors with empty output. Co-Authored-By: Claude Opus 4.5 --- internal/workflow/signals.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/workflow/signals.go b/internal/workflow/signals.go index ac6cc1a3..6f9e8824 100644 --- a/internal/workflow/signals.go +++ b/internal/workflow/signals.go @@ -107,7 +107,7 @@ func configureSignalHandler(myTargets []target.Target, statusFunc progress.Multi go func(tgt target.Target, pid string) { defer wg.Done() // create a per-target timeout context - targetTimeout := 10 * time.Second + targetTimeout := 20 * time.Second ctx, cancel := context.WithTimeout(context.Background(), targetTimeout) defer cancel() timedOut := false