telemetry2: thread_info: Add telemetry2 thread info module
Adds code for collecting per-thread CPU and stack usage statistics, and
for writing that information to a telemetry2 memory chunk.

Signed-off-by: Jyri Sarha <jyri.sarha@linux.intel.com>
Jyri Sarha committed May 24, 2024
commit 0acff8674cdc29b90c834be607ac7f9e65a6a94e
4 changes: 4 additions & 0 deletions src/debug/telemetry/CMakeLists.txt
@@ -1,3 +1,7 @@
# SPDX-License-Identifier: BSD-3-Clause

add_local_sources_ifdef(CONFIG_SOF_TELEMETRY sof telemetry.c)

add_local_sources_ifdef(CONFIG_SOF_TELEMETRY2_SLOT sof telemetry2_slot.c)

add_local_sources_ifdef(CONFIG_SOF_TELEMETRY2_THREAD_INFO sof telemetry2_thread_info.c)
23 changes: 23 additions & 0 deletions src/debug/telemetry/Kconfig
@@ -28,4 +28,27 @@ config SOF_TELEMETRY2_SLOT_NUMBER
Which debug slot to reserve for telemetry2. Remember to map
the slot with MEMORY_WIN_2_SIZE in soc/xtensa/intel_adsp.

config SOF_TELEMETRY2_THREAD_INFO
bool "Enable thread info chunk in telemetry2 slot"
select INIT_STACKS
select THREAD_MONITOR
select THREAD_STACK_INFO
select THREAD_RUNTIME_STATS
help
This activates the Zephyr thread info chunk in the telemetry2
slot. For the thread names to be anything more than hex numbers,
it is suggested to also select THREAD_NAME=y.

if SOF_TELEMETRY2_THREAD_INFO

config SOF_TELEMETRY2_THREAD_INFO_INTERVAL
int "Thread information collection interval in seconds"
default 2
range 1 10
help
Decides how often the thread info thread runs to collect
execution cycle statistics and stack usage.

endif

endif
361 changes: 361 additions & 0 deletions src/debug/telemetry/telemetry2_thread_info.c
@@ -0,0 +1,361 @@
// SPDX-License-Identifier: BSD-3-Clause
//
// Copyright(c) 2024 Intel Corporation.

#include <zephyr/logging/log.h>
#include <zephyr/kernel.h>
#include <stdio.h>
#include <soc.h>
#include <adsp_debug_window.h>

#include "telemetry2_slot.h"

LOG_MODULE_REGISTER(thread_info);

#define THREAD_INFO_MAX_THREADS 16

struct thread_info {
char name[14];
Member:
Where does the 14 come from?

Contributor Author:
Just a nice number to get 16-byte thread records. The rest of the thread name is truncated. Actually there are no names available unless CONFIG_THREAD_NAME=y; that should maybe be selected by the SOF_TELEMETRY2_THREAD_INFO option.

Member:
Is there a Zephyr define for this? Can we reuse it?

Contributor Author (@jsarha, May 30, 2024):
> Is there a Zephyr define for this? Can we reuse it?

I could not find a single define setting a maximum length for a thread name. Instead there are two local ones, for presenting the name in:
subsys/net/lib/shell/sockets.c:#define THREAD_NAME_LEN 23
subsys/tracing/sysview/sysview_config.c:#define THREAD_NAME_LEN 20

I wonder if there is any maximum length. It is just a string that is not used in the Zephyr core code for anything; it is there only for a human observer. 14 makes the whole packed struct a nice round 16 bytes, but I can of course make it whatever we want it to be. If the name is longer, the rest is truncated from the thread info output.

Member:
Good find - @teburd fyi, some mismatching lengths for strings.
@jsarha I assume this is just a const char * in thread.h? If so, you could do the same here?

Contributor Author:
Oh, it is actually a Kconfig variable, THREAD_MAX_NAME_LEN, and it defaults to 32. However, with the static data structure I did not want to waste memory on the maximum thread name length when 14 is usually enough. I think the other name-length definitions are there for similar reasons. In any case, in the new circular-buffer-based approach I intend to make the name length dynamic rather than static.

uint8_t stack_usage; /* Relative stack usage U(0,8) fixed point value */
uint8_t cpu_usage; /* Relative cpu usage U(0,8) fixed point value */
} __packed;
Member:
Aligned?

Contributor Author:
No need. The records are in a table that is core specific.

Member:
OK, but won't that need cache line alignment?

Contributor Author (@jsarha, May 30, 2024):
> OK, but won't that need cache line alignment?

The table should be cache aligned, but not a single element holding information for just one thread. One cache line can hold info for 4 threads. Currently the code reserves a table of 16 struct thread_info entries for each core.


#define THREAD_INFO_VERSION_0_0 0

#define THREAD_INFO_STATE_UNINITIALIZED 0
#define THREAD_INFO_STATE_BEING_UPDATED 1
#define THREAD_INFO_STATE_UPTODATE 2

/* Core specific data, updated each round, including the thread table. */
struct thread_info_core {
uint8_t state; /* indicates if core data is in consistent state */
uint8_t counter; /* incremented every round */
uint8_t load; /* Core's load U(0,8) fixed point value */
uint8_t thread_count;
struct thread_info thread[THREAD_INFO_MAX_THREADS];
} __packed __aligned(CONFIG_DCACHE_LINE_SIZE);
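
As a purely illustrative aside (not part of the patch), the sizing rationale from the review thread above can be captured with compile-time checks; the assertions assume the struct layouts exactly as written in this diff:

/* Illustrative only: a 14-byte name plus two one-byte usage fields packs
 * each thread_info record to 16 bytes, so one cache line holds several
 * records and only the per-core table as a whole carries the cache-line
 * alignment.
 */
BUILD_ASSERT(sizeof(struct thread_info) == 16,
	     "thread_info record expected to pack to 16 bytes");
BUILD_ASSERT(__alignof__(struct thread_info_core) == CONFIG_DCACHE_LINE_SIZE,
	     "per-core thread table should be cache-line aligned");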

/* Thread info telemetry2 chunk header. Only initialized at first
* thread info thread start. Should be used to find the core specific
* sections where the thread info is when decoding.
*/
struct thread_info_chunk {
struct telemetry2_chunk_hdr hdr;
uint16_t core_count;
Member:
A thread can run on multiple cores?

Contributor Author:
This is the header of the whole thread info chunk, and the number indicates how many cores we have. The code keeps all the cores separate all the time; nothing else seems to work (the DSP hangs). So there is a thread running on each core, reporting the threads on that core. This header is written just once in the beginning and is used by the user-space client to find the core specific sections. I'll add some more comments.

uint16_t core_offset[CONFIG_MP_MAX_NUM_CPUS];
struct thread_info_core __aligned(CONFIG_DCACHE_LINE_SIZE) core[CONFIG_MP_MAX_NUM_CPUS];
} __packed;
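
To make the decoding flow described in the reply above concrete, here is a minimal, hypothetical host-side sketch; the function name, the byte-buffer source and the printing are illustrative assumptions, only core_count, core_offset and the per-core layout come from this patch:

/* Hypothetical reader: 'base' points at a copy of the thread info chunk
 * read out of the telemetry2 debug slot. Needs <stdio.h>, <stdint.h> and
 * the struct definitions above.
 */
static void dump_thread_info_chunk(const uint8_t *base)
{
	const struct thread_info_chunk *chunk = (const void *)base;

	for (unsigned int i = 0; i < chunk->core_count; i++) {
		const struct thread_info_core *core =
			(const void *)(base + chunk->core_offset[i]);

		if (core->state != THREAD_INFO_STATE_UPTODATE)
			continue; /* core section not in a consistent state yet */

		for (unsigned int j = 0; j < core->thread_count; j++)
			printf("core %u %-14.14s cpu %3u%% stack %3u%%\n", i,
			       core->thread[j].name,
			       core->thread[j].cpu_usage * 100U / 255U,
			       core->thread[j].stack_usage * 100U / 255U);
	}
}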

#ifdef CONFIG_THREAD_RUNTIME_STATS
/* Data structure to store the cycle counter values from the previous
* round. The numbers are used to calculate what the load was on this
* round.
*/
static struct previous_counters { // Cached data from previous round
uint64_t active; // All execution cycles
uint64_t all; // All cycles including idle
struct thread_counters {
void *tid; // thread ID (the thread struct ptr)
uint64_t cycles; // cycle counter value
} threads[THREAD_INFO_MAX_THREADS]; // The max amount of threads we follow
} previous[CONFIG_MP_MAX_NUM_CPUS];
#endif

/* Data structure to be passed down to thread_info_cb() by
* k_thread_foreach_current_cpu().
*/
struct user_data {
struct thread_info_core *core_data;
int thread_count;
#ifdef CONFIG_THREAD_RUNTIME_STATS
bool stats_valid;
uint32_t all_cycles;
void *active_threads[THREAD_INFO_MAX_THREADS];
struct previous_counters *previous;
#endif
};

#ifdef CONFIG_THREAD_RUNTIME_STATS

/* Finds and/or updates the cached cycle counter value for the 'tid'
* thread, and calculates and returns the execution cycles used since
* the previous round.
*/

static uint32_t get_cycles(void *tid, k_thread_runtime_stats_t *thread_stats,
struct user_data *ud, const char *name)
{
int i;

ud->active_threads[ud->thread_count] = tid; // Mark the thread as active
// look for cached value from previous round for 'tid'-thread
for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) {
if (ud->previous->threads[i].tid == tid) {
// Calculate the number of cycles since the previous round
uint32_t cycles = (uint32_t) (thread_stats->execution_cycles -
ud->previous->threads[i].cycles);

LOG_DBG("%p found at %d (%s %llu)", tid, i,
name, thread_stats->execution_cycles);
// update the cached value
ud->previous->threads[i].cycles = thread_stats->execution_cycles;
return cycles;
}
}

// If no cached value was found, look for an empty slot to store the recent value
for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) {
if (ud->previous->threads[i].tid == NULL) {
ud->previous->threads[i].tid = tid;
ud->previous->threads[i].cycles = thread_stats->execution_cycles;
LOG_DBG("%p placed at %d (%s %llu)", tid, i,
name, ud->previous->threads[i].cycles);
break;
}
}

// If there are more than THREAD_INFO_MAX_THREADS threads in the system
if (i == ARRAY_SIZE(ud->previous->threads))
LOG_INF("No place found for %s %p", name, tid);

// If there was no previous counter value to compare, return 0 cycles.
return 0;
}

static uint8_t thread_info_cpu_utilization(struct k_thread *thread,
Member:
It looks like some of this file is RTOS specific and should go in Zephyr as Zephyr APIs.

Contributor Author:
It is just refining the Zephyr thread stats cycle-counter values to produce load information. It requires cached values from the previous round, and a generic Zephyr-side implementation would become quite complex, requiring caller-specific context allocation etc. If there is time for it I can do it, of course, but AFAIK we do not have it ATM.

struct user_data *ud, const char *name)
{
k_thread_runtime_stats_t thread_stats;
uint32_t cycles;

if (!ud->stats_valid)
return 0;

if (k_thread_runtime_stats_get(thread, &thread_stats) != 0)
return 0;

cycles = get_cycles(thread, &thread_stats, ud, name);

LOG_DBG("thread %s %u / %u", name, cycles, ud->all_cycles);

return (uint8_t) ((255llu * cycles) / ud->all_cycles);
}
#else
static uint8_t thread_info_cpu_utilization(struct k_thread *thread,
struct user_data *ud, const char *name)
{
return 0;
}
#endif
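
For reference, the 255-scaled value computed above (like the matching stack_usage value produced below) is a U(0,8) fixed-point fraction in which 255 encodes 100%; a hypothetical helper on the consuming side could convert it back roughly like this:

/* Illustrative only, not part of the patch: U(0,8) fraction -> percent,
 * rounded to the nearest whole percent (e.g. 64 -> 25, 255 -> 100).
 */
static inline unsigned int u0_8_to_percent(uint8_t frac)
{
	return (frac * 100U + 127U) / 255U;
}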

#ifdef CONFIG_THREAD_STACK_INFO
static uint8_t thread_info_stack_level(struct k_thread *thread, const char *name)
{
size_t stack_size, stack_unused;
int ret;

stack_size = thread->stack_info.size;
ret = k_thread_stack_space_get(thread, &stack_unused);
if (ret) {
LOG_ERR(" %-20s: unable to get stack space (%d)",
name, ret);
stack_unused = 0;
}
return (UINT8_MAX * (stack_size - stack_unused)) / stack_size;
}
#else
static uint8_t thread_info_stack_level(struct k_thread *thread, const char *name)
{
return 0;
}
#endif

static void thread_info_cb(const struct k_thread *cthread, void *user_data)
{
struct k_thread *thread = (struct k_thread *)cthread;
struct user_data *ud = user_data;
struct thread_info *thread_info;
const char *name;

if (ud->thread_count >= ARRAY_SIZE(ud->core_data->thread)) {
LOG_ERR("Thread count %u exceeds the memory window size",
ud->thread_count);
return;
}

thread_info = &ud->core_data->thread[ud->thread_count];

name = k_thread_name_get((k_tid_t)thread);
if (!name || name[0] == '\0') {
snprintk(thread_info->name, sizeof(thread_info->name), "%p",
(void *)thread);
} else {
strncpy(thread_info->name, name, sizeof(thread_info->name));
}

thread_info->stack_usage = thread_info_stack_level(thread, name);
thread_info->cpu_usage = thread_info_cpu_utilization(thread, ud, name);

LOG_DBG("core %u %s stack %u%% cpu %u%%", ud->core,
thread_info->name, thread_info->stack_usage * 100U / 255,
thread_info->cpu_usage * 100U / 255);

ud->thread_count++;
}

#ifdef CONFIG_THREAD_RUNTIME_STATS

/* Marks the cached thread cycle counter entries of removed threads as
* free. This also happens to threads pinned to a CPU other than the
* primary one. In the beginning they are listed on the primary CPU,
* until the pinned CPU is started up and the thread is executed for
* the first time and moves to the CPU it is pinned on.
*/
static void cleanup_old_thread_cycles(struct user_data *ud)
{
int i, j;

for (i = 0; i < ARRAY_SIZE(ud->previous->threads); i++) {
bool found = false;

// This entry is already free, continue
if (ud->previous->threads[i].tid == NULL)
continue;

// Check if the thread is still active
for (j = 0; j < ud->thread_count; j++) {
if (ud->active_threads[j] == ud->previous->threads[i].tid) {
found = true;
break;
}
}
// If the thread is no longer active, mark the entry free
if (!found) {
ud->previous->threads[i].tid = NULL;
ud->previous->threads[i].cycles = 0;
}
}
}
#else
static void cleanup_old_thread_cycles(struct user_data *ud) { }
#endif

static void thread_info_get(struct thread_info_core *core_data)
{
k_thread_runtime_stats_t core_stats;
struct user_data ud = {
.core_data = core_data,
.thread_count = 0,
#ifdef CONFIG_THREAD_RUNTIME_STATS
.previous = &previous[arch_curr_cpu()->id],
.active_threads = { NULL },
#endif
};
uint8_t load = 0;
#ifdef CONFIG_THREAD_RUNTIME_STATS
int ret = k_thread_runtime_stats_current_cpu_get(&core_stats);

if (ret == 0) {
uint32_t active_cycles = (uint32_t) (core_stats.total_cycles -
ud.previous->active);
uint32_t all_cycles = (uint32_t) (core_stats.execution_cycles -
ud.previous->all);

ud.stats_valid = true;
load = (uint8_t) ((255LLU * active_cycles) / all_cycles);
LOG_DBG("Core %u load %u / %u total %llu / %llu", arch_curr_cpu()->id,
active_cycles, all_cycles,
core_stats.total_cycles, core_stats.execution_cycles);
ud.previous->active = core_stats.total_cycles;
ud.previous->all = core_stats.execution_cycles;
ud.all_cycles = all_cycles;
}
#endif
core_data->state = THREAD_INFO_STATE_BEING_UPDATED;

core_data->load = load;
k_thread_foreach_current_cpu(thread_info_cb, &ud);

cleanup_old_thread_cycles(&ud);

core_data->counter++;
core_data->thread_count = ud.thread_count;
core_data->state = THREAD_INFO_STATE_UPTODATE;
}

static void thread_info_run(void *data, void *cnum, void *a)
{
struct thread_info_chunk *chunk = data;
int core = (int) cnum;
struct thread_info_core *core_data = &chunk->core[core];

for (;;) {
thread_info_get(core_data);
k_sleep(K_SECONDS(CONFIG_SOF_TELEMETRY2_THREAD_INFO_INTERVAL));
}
}

static struct thread_info_chunk *thread_info_init(void)
{
struct thread_info_chunk *chunk;
int i;

chunk = telemetry2_chunk_get(TELEMETRY2_ID_THREAD_INFO,
sizeof(*chunk));
if (!chunk)
return NULL;

chunk->core_count = CONFIG_MP_MAX_NUM_CPUS;
for (i = 0; i < CONFIG_MP_MAX_NUM_CPUS; i++)
chunk->core_offset[i] = offsetof(struct thread_info_chunk, core[i]);

return chunk;
}

static K_THREAD_STACK_ARRAY_DEFINE(info_thread_stacks, CONFIG_MP_MAX_NUM_CPUS,
1024);
static struct k_thread info_thread[CONFIG_MP_MAX_NUM_CPUS];

static int thread_info_start(void)
{
struct thread_info_chunk *chunk = thread_info_init();
uint32_t i;

if (!chunk)
return 0;

for (i = 0; i < ARRAY_SIZE(info_thread); i++) {
char name[24];
k_tid_t tid = NULL;
int ret;

tid = k_thread_create(&info_thread[i], info_thread_stacks[i],
1024,
thread_info_run,
chunk, (void *) i, NULL,
K_LOWEST_APPLICATION_THREAD_PRIO, 0, K_FOREVER);
if (!tid) {
LOG_ERR("k_thread_create() failed for core %u", i);
continue;
}
ret = k_thread_cpu_pin(tid, i);
if (ret < 0) {
LOG_ERR("Pinning thread to code core %u", i);
k_thread_abort(tid);
continue;
}
snprintf(name, sizeof(name), "%u thread info", i);
ret = k_thread_name_set(tid, name);
if (ret < 0)
LOG_INF("k_thread_name_set failed: %d for %u", ret, i);

k_thread_start(tid);
LOG_DBG("Thread %p for core %u started", tid, i);
}

return 0;
}

SYS_INIT(thread_info_start, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT);