From 1801d5590e614fdb2dc51a8b609e8ca1b1fe3997 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Thu, 4 Dec 2025 20:18:23 +0800 Subject: [PATCH 1/4] [Docs] Improve static shape tuning parameter configuration (follow-up to commit c71aefc) - Add max_trials_per_task parameter to static_shape_tuning_pipeline - Adjust default TOTAL_TRIALS from 8000 to 80 for tutorial demonstration purposes - Add documentation for tuning parameters in tutorial, clarifying relationship between MAX_TRIALS_PER_TASK and TOTAL_TRIALS --- docs/how_to/tutorials/e2e_opt_model.py | 29 ++++++++++++++++++++++++-- python/tvm/relax/pipeline.py | 21 ++++++++++++++++--- 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/docs/how_to/tutorials/e2e_opt_model.py b/docs/how_to/tutorials/e2e_opt_model.py index 8307ddc4f299..81007609eccb 100644 --- a/docs/how_to/tutorials/e2e_opt_model.py +++ b/docs/how_to/tutorials/e2e_opt_model.py @@ -95,13 +95,38 @@ # leverage MetaSchedule to tune the model and store the tuning logs to the database. We also # apply the database to the model to get the best performance. # +# The ResNet18 model will be divided into 20 independent tuning tasks during compilation. +# To ensure each task receives adequate tuning resources in one iteration while providing +# early feedback: +# +# - To quickly observe tuning progress, each task is allocated a maximum of 4 trials per +# iteration (controlled by ``MAX_TRIALS_PER_TASK=4``). Setting ``TOTAL_TRIALS`` to at least +# ``80 (20 tasks * 4 trials)`` ensures every task receives one full iteration of tuning. +# - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``min(max_trials_per_iter=64, +# TOTAL_TRIALS)`` trials per task per iteration. This may lead to undersubscribed tuning when +# ``TOTAL_TRIALS`` is insufficient (e.g., ``64 < TOTAL_TRIALS < 20 * 64``), potentially skipping +# some tasks entirely, leaving critical operators unoptimized or missing thread binding for +# untuned tasks. 
Explicitly setting both parameters avoids this issue and provides deterministic +# resource allocation across all tasks. +# +# Note: These parameter settings are optimized for quick tutorial demonstration. For production +# deployments requiring higher performance, we recommend adjusting both MAX_TRIALS_PER_TASK +# and TOTAL_TRIALS to larger values. This allows more extensive search space exploration +# and typically yields better performance outcomes. -TOTAL_TRIALS = 8000 # Change to 20000 for better performance if needed +TOTAL_TRIALS = 80 # Change to 20000 for better performance if needed +MAX_TRIALS_PER_TASK = 4 # Change to more trials per task for better performance if needed target = tvm.target.Target("nvidia/geforce-rtx-3090-ti") # Change to your target device work_dir = "tuning_logs" if not IS_IN_CI: - mod = relax.get_pipeline("static_shape_tuning", target=target, total_trials=TOTAL_TRIALS)(mod) + mod = relax.get_pipeline( + "static_shape_tuning", + target=target, + work_dir=work_dir, + total_trials=TOTAL_TRIALS, + max_trials_per_task=MAX_TRIALS_PER_TASK, + )(mod) # Only show the main function mod["main"].show() diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py index a5850267a8c4..056c647a03f0 100644 --- a/python/tvm/relax/pipeline.py +++ b/python/tvm/relax/pipeline.py @@ -21,7 +21,7 @@ as it is or serves as a basis to do further composition. """ # pylint: disable=unused-argument -from typing import Union +from typing import Union, Optional import tvm from tvm import meta_schedule as ms @@ -111,6 +111,7 @@ def static_shape_tuning_pipeline( target: Union[str, tvm.target.Target], work_dir: str = "tuning_logs", cpu_weight_prepack: bool = False, + max_trials_per_task: Optional[int] = None, ): """Tune the static shape model and store the log to database. @@ -128,6 +129,14 @@ def static_shape_tuning_pipeline( cpu_weight_prepack : bool Whether to enable the cpu weight prepack feature. 
+ max_trials_per_task : Optional[int] + The maximum number of trials to run per task. + If not specified, MetaSchedule will use a default value of 64 + trials per task during the tuning process. + For optimal tuning, set `total_trials` to at least + `max_trials_per_task * number_of_tuning_tasks` to ensure + each task receives adequate tuning resources in one iteration. + Note ---- `cpu_weight_prepack` is expected to be `True` when running on CPU for @@ -142,6 +151,7 @@ def static_shape_tuning_pipeline( target="llvm -num-cores 16", work_dir="tuning_logs", cpu_weight_prepack=True, + max_trials_per_task=64, )(mod) ex = tvm.compile(mod, target=target) @@ -177,8 +187,13 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I *pre_tuning_layout_rewrite, # Skip tuning if total_trials is 0 ( - transform.MetaScheduleTuneIRMod({}, work_dir, total_trials) - if total_trials > 0 + transform.MetaScheduleTuneIRMod( + params={}, + work_dir=work_dir, + max_trials_global=total_trials, + max_trials_per_task=max_trials_per_task, + ) + if total_trials > 0 and max_trials_per_task > 0 else tvm.transform.Sequential([]) ), transform.MetaScheduleApplyDatabase(work_dir), From 96aad819d684e4cf7cb7bf2214f387a224283aa5 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Thu, 4 Dec 2025 20:25:32 +0800 Subject: [PATCH 2/4] fix --- python/tvm/relax/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py index 056c647a03f0..e1a18513e335 100644 --- a/python/tvm/relax/pipeline.py +++ b/python/tvm/relax/pipeline.py @@ -193,7 +193,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I max_trials_global=total_trials, max_trials_per_task=max_trials_per_task, ) - if total_trials > 0 and max_trials_per_task > 0 + if total_trials > 0 else tvm.transform.Sequential([]) ), transform.MetaScheduleApplyDatabase(work_dir), From 85ddec6458adcf9b230bd58b57392b2a916423eb Mon Sep 
17 00:00:00 2001 From: ConvolutedDog Date: Fri, 5 Dec 2025 16:58:02 +0800 Subject: [PATCH 3/4] Update trials config --- docs/how_to/tutorials/e2e_opt_model.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/how_to/tutorials/e2e_opt_model.py b/docs/how_to/tutorials/e2e_opt_model.py index 81007609eccb..9e9204e3b2f1 100644 --- a/docs/how_to/tutorials/e2e_opt_model.py +++ b/docs/how_to/tutorials/e2e_opt_model.py @@ -99,9 +99,11 @@ # To ensure each task receives adequate tuning resources in one iteration while providing # early feedback: # -# - To quickly observe tuning progress, each task is allocated a maximum of 4 trials per -# iteration (controlled by ``MAX_TRIALS_PER_TASK=4``). Setting ``TOTAL_TRIALS`` to at least -# ``80 (20 tasks * 4 trials)`` ensures every task receives one full iteration of tuning. +# - To quickly observe tuning progress, each task is allocated a maximum of 16 trials per +# iteration (controlled by ``MAX_TRIALS_PER_TASK=16``). We should set ``TOTAL_TRIALS`` +# to at least ``320 (20 tasks * 16 trials)`` to ensure every task receives one full iteration +# of tuning. We set it to 512 in our configuration to allow for several more iterations, +# aiming to explore a wider parameter space and potentially achieve better performance. # - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``min(max_trials_per_iter=64, # TOTAL_TRIALS)`` trials per task per iteration. This may lead to undersubscribed tuning when # ``TOTAL_TRIALS`` is insufficient (e.g., ``64 < TOTAL_TRIALS < 20 * 64``), potentially skipping @@ -114,8 +116,8 @@ # and TOTAL_TRIALS to larger values. This allows more extensive search space exploration # and typically yields better performance outcomes. 
-TOTAL_TRIALS = 80 # Change to 20000 for better performance if needed -MAX_TRIALS_PER_TASK = 4 # Change to more trials per task for better performance if needed +TOTAL_TRIALS = 512 # Change to 20000 for better performance if needed +MAX_TRIALS_PER_TASK = 16 # Change to more trials per task for better performance if needed target = tvm.target.Target("nvidia/geforce-rtx-3090-ti") # Change to your target device work_dir = "tuning_logs" From 73c9cba08e7a30262451adf7f69cc26220c84e69 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Fri, 5 Dec 2025 20:01:24 +0800 Subject: [PATCH 4/4] fix doc --- docs/how_to/tutorials/e2e_opt_model.py | 14 ++++++-------- python/tvm/relax/pipeline.py | 6 ++++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/how_to/tutorials/e2e_opt_model.py b/docs/how_to/tutorials/e2e_opt_model.py index 9e9204e3b2f1..507864160d9f 100644 --- a/docs/how_to/tutorials/e2e_opt_model.py +++ b/docs/how_to/tutorials/e2e_opt_model.py @@ -104,16 +104,14 @@ # to at least ``320 (20 tasks * 16 trials)`` to ensure every task receives one full iteration # of tuning. We set it to 512 in our configuration to allow for several more iterations, # aiming to explore a wider parameter space and potentially achieve better performance. -# - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``min(max_trials_per_iter=64, -# TOTAL_TRIALS)`` trials per task per iteration. This may lead to undersubscribed tuning when -# ``TOTAL_TRIALS`` is insufficient (e.g., ``64 < TOTAL_TRIALS < 20 * 64``), potentially skipping -# some tasks entirely, leaving critical operators unoptimized or missing thread binding for -# untuned tasks. Explicitly setting both parameters avoids this issue and provides deterministic -# resource allocation across all tasks. +# - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``TOTAL_TRIALS`` trials per +# task per iteration. 
An insufficient ``TOTAL_TRIALS`` setting may lead to undersubscribed +# tuning, potentially skipping some tasks entirely. Explicitly setting both parameters +# avoids this issue and provides deterministic resource allocation across all tasks. # # Note: These parameter settings are optimized for quick tutorial demonstration. For production -# deployments requiring higher performance, we recommend adjusting both MAX_TRIALS_PER_TASK -# and TOTAL_TRIALS to larger values. This allows more extensive search space exploration +# deployments requiring higher performance, we recommend adjusting both ``MAX_TRIALS_PER_TASK`` +# and ``TOTAL_TRIALS`` to larger values. This allows more extensive search space exploration # and typically yields better performance outcomes. TOTAL_TRIALS = 512 # Change to 20000 for better performance if needed diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py index e1a18513e335..388f9dbb43cd 100644 --- a/python/tvm/relax/pipeline.py +++ b/python/tvm/relax/pipeline.py @@ -131,8 +131,10 @@ def static_shape_tuning_pipeline( max_trials_per_task : Optional[int] The maximum number of trials to run per task. - If not specified, MetaSchedule will use a default value of 64 - trials per task during the tuning process. + If not specified, it defaults to the value of `total_trials`, and this + may lead to undersubscribed tuning, potentially skipping some tasks + entirely. Explicitly setting both parameters avoids this issue and + provides deterministic resource allocation across all tasks. For optimal tuning, set `total_trials` to at least `max_trials_per_task * number_of_tuning_tasks` to ensure each task receives adequate tuning resources in one iteration.