29 changes: 27 additions & 2 deletions docs/how_to/tutorials/e2e_opt_model.py
@@ -95,13 +95,38 @@
# leverage MetaSchedule to tune the model and store the tuning logs to the database. We also
# apply the database to the model to get the best performance.
#
# The ResNet18 model will be divided into 20 independent tuning tasks during compilation.
# To ensure each task receives adequate tuning resources in one iteration while providing
# early feedback:
#
# - To quickly observe tuning progress, each task is allocated a maximum of 16 trials per
#   iteration (controlled by ``MAX_TRIALS_PER_TASK=16``). Setting ``TOTAL_TRIALS`` to at
#   least ``320`` (20 tasks * 16 trials) ensures every task receives one full iteration
#   of tuning. We set it to 512 in our configuration to allow for several more iterations,
#   aiming to explore a wider parameter space and potentially achieve better performance.
# - If ``MAX_TRIALS_PER_TASK == None``, the system defaults to ``TOTAL_TRIALS`` trials per
#   task per iteration. An insufficient ``TOTAL_TRIALS`` setting may lead to undersubscribed
#   tuning, potentially skipping some tasks entirely. Explicitly setting both parameters
#   avoids this issue and provides deterministic resource allocation across all tasks.
#
# Note: These parameter settings are optimized for quick tutorial demonstration. For production
# deployments requiring higher performance, we recommend adjusting both ``MAX_TRIALS_PER_TASK``
# and ``TOTAL_TRIALS`` to larger values. This allows more extensive search space exploration
# and typically yields better performance outcomes.

-TOTAL_TRIALS = 8000  # Change to 20000 for better performance if needed
+TOTAL_TRIALS = 512  # Change to 20000 for better performance if needed
+MAX_TRIALS_PER_TASK = 16  # Change to more trials per task for better performance if needed
target = tvm.target.Target("nvidia/geforce-rtx-3090-ti") # Change to your target device
work_dir = "tuning_logs"

if not IS_IN_CI:
-    mod = relax.get_pipeline("static_shape_tuning", target=target, total_trials=TOTAL_TRIALS)(mod)
+    mod = relax.get_pipeline(
+        "static_shape_tuning",
+        target=target,
+        work_dir=work_dir,
+        total_trials=TOTAL_TRIALS,
+        max_trials_per_task=MAX_TRIALS_PER_TASK,
+    )(mod)

    # Only show the main function
    mod["main"].show()
21 changes: 19 additions & 2 deletions python/tvm/relax/pipeline.py
@@ -21,7 +21,7 @@
as it is or serves as a basis to do further composition.
"""
# pylint: disable=unused-argument
-from typing import Union
+from typing import Union, Optional

import tvm
from tvm import meta_schedule as ms
@@ -111,6 +111,7 @@ def static_shape_tuning_pipeline(
    target: Union[str, tvm.target.Target],
    work_dir: str = "tuning_logs",
    cpu_weight_prepack: bool = False,
    max_trials_per_task: Optional[int] = None,
):
    """Tune the static shape model and store the log to database.

@@ -128,6 +129,16 @@
    cpu_weight_prepack : bool
        Whether to enable the cpu weight prepack feature.

    max_trials_per_task : Optional[int]
        The maximum number of trials to run per task.
        If not specified, it defaults to the value of `total_trials`, and this
        may lead to undersubscribed tuning, potentially skipping some tasks
        entirely. Explicitly setting both parameters avoids this issue and
        provides deterministic resource allocation across all tasks.
        For optimal tuning, set `total_trials` to at least
        `max_trials_per_task * number_of_tuning_tasks` to ensure
        each task receives adequate tuning resources in one iteration.

    Note
    ----
    `cpu_weight_prepack` is expected to be `True` when running on CPU for
@@ -142,6 +153,7 @@
target="llvm -num-cores 16",
work_dir="tuning_logs",
cpu_weight_prepack=True,
max_trials_per_task=64,
)(mod)

ex = tvm.compile(mod, target=target)
@@ -177,7 +189,12 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.IRModule:
                    *pre_tuning_layout_rewrite,
                    # Skip tuning if total_trials is 0
                    (
-                        transform.MetaScheduleTuneIRMod({}, work_dir, total_trials)
+                        transform.MetaScheduleTuneIRMod(
+                            params={},
+                            work_dir=work_dir,
+                            max_trials_global=total_trials,
+                            max_trials_per_task=max_trials_per_task,
+                        )
                        if total_trials > 0
                        else tvm.transform.Sequential([])
                    ),
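A side effect of the conditional above is that ``total_trials=0`` turns the pipeline into a pure compilation flow with no tuning. A minimal sketch, assuming ``mod`` is an already-imported static-shape Relax IRModule (obtaining it is out of scope here):

```python
import tvm
from tvm import relax

# Minimal sketch: total_trials=0 substitutes an empty
# tvm.transform.Sequential([]) for MetaScheduleTuneIRMod, so no tuning runs.
target = tvm.target.Target("llvm -num-cores 16")

# `mod` is assumed to be a static-shape Relax IRModule defined elsewhere.
mod = relax.get_pipeline(
    "static_shape_tuning",
    target=target,
    work_dir="tuning_logs",
    total_trials=0,  # skip tuning entirely; fast correctness check
)(mod)

ex = tvm.compile(mod, target=target)
```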