From 1801d5590e614fdb2dc51a8b609e8ca1b1fe3997 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Thu, 4 Dec 2025 20:18:23 +0800 Subject: [PATCH 1/4] [Docs] Improve static shape tuning parameter configuration (follow-up to commit c71aefc) - Add max_trials_per_task parameter to static_shape_tuning_pipeline - Adjust default TOTAL_TRIALS from 8000 to 80 for tutorial demonstration purposes - Add documentation for tuning parameters in tutorial, clarifying relationship between MAX_TRIALS_PER_TASK and TOTAL_TRIALS --- docs/how_to/tutorials/e2e_opt_model.py | 29 ++++++++++++++++++++++++-- python/tvm/relax/pipeline.py | 21 ++++++++++++++++--- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/docs/how_to/tutorials/e2e_opt_model.py b/docs/how_to/tutorials/e2e_opt_model.py index 8307ddc4f299..81007609eccb 100644 --- a/docs/how_to/tutorials/e2e_opt_model.py +++ b/docs/how_to/tutorials/e2e_opt_model.py @@ -95,13 +95,38 @@ # leverage MetaSchedule to tune the model and store the tuning logs to the database. We also # apply the database to the model to get the best performance. # +# The ResNet18 model will be divided into 20 independent tuning tasks during compilation. +# To ensure each task receives adequate tuning resources in one iteration while providing +# early feedback: +# +# - To quickly observe tuning progress, each task is allocated a maximum of 4 trials per +# iteration (controlled by ``MAX_TRIALS_PER_TASK=4``). Setting ``TOTAL_TRIALS`` to at least +# ``80 (20 tasks * 4 trials)`` ensures every task receives one full iteration of tuning. +# - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``min(max_trials_per_iter=64, +# TOTAL_TRIALS)`` trials per task per iteration. This may lead to undersubscribed tuning when +# ``TOTAL_TRIALS`` is insufficient (e.g., ``64 < TOTAL_TRIALS < 20 * 64``), potentially skipping +# some tasks entirely, leaving critical operators unoptimized or missing thread binding for +# untuned tasks. 
Explicitly setting both parameters avoids this issue and provides deterministic +# resource allocation across all tasks. +# +# Note: These parameter settings are optimized for quick tutorial demonstration. For production +# deployments requiring higher performance, we recommend adjusting both MAX_TRIALS_PER_TASK +# and TOTAL_TRIALS to larger values. This allows more extensive search space exploration +# and typically yields better performance outcomes. -TOTAL_TRIALS = 8000 # Change to 20000 for better performance if needed +TOTAL_TRIALS = 80 # Change to 20000 for better performance if needed +MAX_TRIALS_PER_TASK = 4 # Change to more trials per task for better performance if needed target = tvm.target.Target("nvidia/geforce-rtx-3090-ti") # Change to your target device work_dir = "tuning_logs" if not IS_IN_CI: - mod = relax.get_pipeline("static_shape_tuning", target=target, total_trials=TOTAL_TRIALS)(mod) + mod = relax.get_pipeline( + "static_shape_tuning", + target=target, + work_dir=work_dir, + total_trials=TOTAL_TRIALS, + max_trials_per_task=MAX_TRIALS_PER_TASK, + )(mod) # Only show the main function mod["main"].show() diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py index a5850267a8c4..056c647a03f0 100644 --- a/python/tvm/relax/pipeline.py +++ b/python/tvm/relax/pipeline.py @@ -21,7 +21,7 @@ as it is or serves as a basis to do further composition. """ # pylint: disable=unused-argument -from typing import Union +from typing import Union, Optional import tvm from tvm import meta_schedule as ms @@ -111,6 +111,7 @@ def static_shape_tuning_pipeline( target: Union[str, tvm.target.Target], work_dir: str = "tuning_logs", cpu_weight_prepack: bool = False, + max_trials_per_task: Optional[int] = None, ): """Tune the static shape model and store the log to database. @@ -128,6 +129,14 @@ def static_shape_tuning_pipeline( cpu_weight_prepack : bool Whether to enable the cpu weight prepack feature. 
+ max_trials_per_task : Optional[int] + The maximum number of trials to run per task. + If not specified, MetaSchedule will use a default value of 64 + trials per task during the tuning process. + For optimal tuning, set `total_trials` to at least + `max_trials_per_task * number_of_tuning_tasks` to ensure + each task receives adequate tuning resources in one iteration. + Note ---- `cpu_weight_prepack` is expected to be `True` when running on CPU for @@ -142,6 +151,7 @@ def static_shape_tuning_pipeline( target="llvm -num-cores 16", work_dir="tuning_logs", cpu_weight_prepack=True, + max_trials_per_task=64, )(mod) ex = tvm.compile(mod, target=target) @@ -177,8 +187,13 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I *pre_tuning_layout_rewrite, # Skip tuning if total_trials is 0 ( - transform.MetaScheduleTuneIRMod({}, work_dir, total_trials) - if total_trials > 0 + transform.MetaScheduleTuneIRMod( + params={}, + work_dir=work_dir, + max_trials_global=total_trials, + max_trials_per_task=max_trials_per_task, + ) + if total_trials > 0 and max_trials_per_task > 0 else tvm.transform.Sequential([]) ), transform.MetaScheduleApplyDatabase(work_dir), From 96aad819d684e4cf7cb7bf2214f387a224283aa5 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Thu, 4 Dec 2025 20:25:32 +0800 Subject: [PATCH 2/4] fix --- python/tvm/relax/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py index 056c647a03f0..e1a18513e335 100644 --- a/python/tvm/relax/pipeline.py +++ b/python/tvm/relax/pipeline.py @@ -193,7 +193,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I max_trials_global=total_trials, max_trials_per_task=max_trials_per_task, ) - if total_trials > 0 and max_trials_per_task > 0 + if total_trials > 0 else tvm.transform.Sequential([]) ), transform.MetaScheduleApplyDatabase(work_dir), From 85ddec6458adcf9b230bd58b57392b2a916423eb Mon Sep 
17 00:00:00 2001 From: ConvolutedDog Date: Fri, 5 Dec 2025 16:58:02 +0800 Subject: [PATCH 3/4] Update trials config --- docs/how_to/tutorials/e2e_opt_model.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/how_to/tutorials/e2e_opt_model.py b/docs/how_to/tutorials/e2e_opt_model.py index 81007609eccb..9e9204e3b2f1 100644 --- a/docs/how_to/tutorials/e2e_opt_model.py +++ b/docs/how_to/tutorials/e2e_opt_model.py @@ -99,9 +99,11 @@ # To ensure each task receives adequate tuning resources in one iteration while providing # early feedback: # -# - To quickly observe tuning progress, each task is allocated a maximum of 4 trials per -# iteration (controlled by ``MAX_TRIALS_PER_TASK=4``). Setting ``TOTAL_TRIALS`` to at least -# ``80 (20 tasks * 4 trials)`` ensures every task receives one full iteration of tuning. +# - To quickly observe tuning progress, each task is allocated a maximum of 16 trials per +# iteration (controlled by ``MAX_TRIALS_PER_TASK=16``). We should set ``TOTAL_TRIALS`` +# to at least ``320 (20 tasks * 16 trials)`` to ensure every task receives one full iteration +# of tuning. We set it to 512 in our configuration to allow for several more iterations, +# aiming to explore a wider parameter space and potentially achieve better performance. # - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``min(max_trials_per_iter=64, # TOTAL_TRIALS)`` trials per task per iteration. This may lead to undersubscribed tuning when # ``TOTAL_TRIALS`` is insufficient (e.g., ``64 < TOTAL_TRIALS < 20 * 64``), potentially skipping @@ -114,8 +116,8 @@ # and TOTAL_TRIALS to larger values. This allows more extensive search space exploration # and typically yields better performance outcomes. 
-TOTAL_TRIALS = 80 # Change to 20000 for better performance if needed -MAX_TRIALS_PER_TASK = 4 # Change to more trials per task for better performance if needed +TOTAL_TRIALS = 512 # Change to 20000 for better performance if needed +MAX_TRIALS_PER_TASK = 16 # Change to more trials per task for better performance if needed target = tvm.target.Target("nvidia/geforce-rtx-3090-ti") # Change to your target device work_dir = "tuning_logs" From 73c9cba08e7a30262451adf7f69cc26220c84e69 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Fri, 5 Dec 2025 20:01:24 +0800 Subject: [PATCH 4/4] fix doc --- docs/how_to/tutorials/e2e_opt_model.py | 14 ++++++-------- python/tvm/relax/pipeline.py | 6 ++++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/how_to/tutorials/e2e_opt_model.py b/docs/how_to/tutorials/e2e_opt_model.py index 9e9204e3b2f1..507864160d9f 100644 --- a/docs/how_to/tutorials/e2e_opt_model.py +++ b/docs/how_to/tutorials/e2e_opt_model.py @@ -104,16 +104,14 @@ # to at least ``320 (20 tasks * 16 trials)`` to ensure every task receives one full iteration # of tuning. We set it to 512 in our configuration to allow for several more iterations, # aiming to explore a wider parameter space and potentially achieve better performance. -# - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``min(max_trials_per_iter=64, -# TOTAL_TRIALS)`` trials per task per iteration. This may lead to undersubscribed tuning when -# ``TOTAL_TRIALS`` is insufficient (e.g., ``64 < TOTAL_TRIALS < 20 * 64``), potentially skipping -# some tasks entirely, leaving critical operators unoptimized or missing thread binding for -# untuned tasks. Explicitly setting both parameters avoids this issue and provides deterministic -# resource allocation across all tasks. +# - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``TOTAL_TRIALS`` trials per +# task per iteration. 
An insufficient ``TOTAL_TRIALS`` setting may lead to undersubscribed +# tuning, potentially skipping some tasks entirely. Explicitly setting both parameters +# avoids this issue and provides deterministic resource allocation across all tasks. # # Note: These parameter settings are optimized for quick tutorial demonstration. For production -# deployments requiring higher performance, we recommend adjusting both MAX_TRIALS_PER_TASK -# and TOTAL_TRIALS to larger values. This allows more extensive search space exploration +# deployments requiring higher performance, we recommend adjusting both ``MAX_TRIALS_PER_TASK`` +# and ``TOTAL_TRIALS`` to larger values. This allows more extensive search space exploration # and typically yields better performance outcomes. TOTAL_TRIALS = 512 # Change to 20000 for better performance if needed diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py index e1a18513e335..388f9dbb43cd 100644 --- a/python/tvm/relax/pipeline.py +++ b/python/tvm/relax/pipeline.py @@ -131,8 +131,10 @@ def static_shape_tuning_pipeline( max_trials_per_task : Optional[int] The maximum number of trials to run per task. - If not specified, MetaSchedule will use a default value of 64 - trials per task during the tuning process. + If not specified, it defaults to the value of `total_trials`, and this + may lead to undersubscribed tuning, potentially skipping some tasks + entirely. Explicitly setting both parameters avoids this issue and + provides deterministic resource allocation across all tasks. For optimal tuning, set `total_trials` to at least `max_trials_per_task * number_of_tuning_tasks` to ensure each task receives adequate tuning resources in one iteration.