Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/sanitizers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ name: Sanitizers
# parallelism-limited subset to dodge the sim-oversubscription livelock; see the
# run step. detect_leaks=0 until LSan suppressions exist for the device arenas.
on:
schedule:
- cron: "0 18 * * *" # 02:00 Beijing
pull_request:
branches: [main]

concurrency:
group: sanitizers-${{ github.ref }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
CYCLE_COUNT_LAP(prof_param_extract);

LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch);

// Reshape tensors for kernel consumption (2D flattened)
void *query_ptr = orch_args.tensor(0).data_as<void>();
void *kc_ptr = orch_args.tensor(1).data_as<void>();
Expand Down Expand Up @@ -251,43 +249,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
CYCLE_COUNT_LAP(prof_scope);
}
}

#ifdef ENABLE_PROFILING
uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
prof_submit_task + prof_scope;
LOG_INFO_V9(
"=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
prof_make_count, prof_view_count, cycles_to_us(total)
);
if (total > 0) {
LOG_INFO_V9(
" param_extract : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
prof_param_extract * 100.0 / total
);
LOG_INFO_V9(
" ext_tensor(x4) : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
);
LOG_INFO_V9(
" create_info(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
prof_make_tensor * 100.0 / total,
prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
);
LOG_INFO_V9(
" tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
prof_tensor_view * 100.0 / total,
prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
);
LOG_INFO_V9(
" param_setup : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
);
LOG_INFO_V9(" scope : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total);
LOG_INFO_V9(
" submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
prof_submit_task * 100.0 / total,
prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
);
}
#endif
}

} // extern "C"
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,22 @@ class TestPagedAttention(SceneTestCase):
"dtype": "bfloat16",
},
},
{
"name": "Case4",
"platforms": ["a2a3"],
"config": {"aicpu_thread_num": 4, "block_dim": 24},
"manual": True,
"params": {
"batch": 16,
"num_heads": 16,
"kv_head_num": 1,
"head_dim": 16,
"block_size": 16,
"context_len": 256,
"max_model_len": 2048,
"dtype": "bfloat16",
},
},
{
"name": "CaseSmall1",
"platforms": ["a2a3sim", "a2a3"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@

// Tensor dump uses these defaults to size its selective mask table so task-id
// ring/slot lookup stays aligned with PTO2 task id layout.
//
// Each macro is wrapped in #ifndef, so a definition that is already in effect
// when this header is processed (from an earlier header or a compiler -D flag)
// takes precedence and is not redefined here.
// NOTE(review): an overriding value presumably has to match the one the PTO2
// runtime itself is built with, otherwise the lookup alignment described above
// would no longer hold -- confirm against the runtime's own definitions.
#ifndef PTO2_TASK_WINDOW_SIZE
#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2: 2^14)
#endif
#ifndef PTO2_MAX_RING_DEPTH
#define PTO2_MAX_RING_DEPTH 4 // Default number of task-id ring layers
#endif

#endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
Loading