diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py
index c914ee46a8..d72c270667 100644
--- a/deepmd/pd/train/training.py
+++ b/deepmd/pd/train/training.py
@@ -599,6 +599,49 @@ def warm_up_linear(step, warmup_steps):
         else:
             raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
 
+        # NOTE: jit.to_static (with the CINN backend) must be applied to the raw
+        # wrapper, before the distributed wrapper below replaces self.wrapper.
+        if CINN:
+            from paddle import (
+                jit,
+                static,
+            )
+
+            # Leading dims are fixed to 1; -1 marks a dynamic dim (presumably atoms).
+            self.wrapper.forward = jit.to_static(
+                backend="CINN",
+                input_spec=[
+                    static.InputSpec([1, -1, 3], "float64", name="coord"),  # coord
+                    static.InputSpec([1, -1], "int32", name="atype"),  # atype
+                    None,  # spin
+                    static.InputSpec([1, 9], "float64", name="box"),  # box
+                    static.InputSpec([], "float64", name="cur_lr"),  # cur_lr
+                    {
+                        "find_box": np.float32(1.0),
+                        "find_coord": np.float32(1.0),
+                        "find_numb_copy": np.float32(0.0),
+                        "numb_copy": static.InputSpec(
+                            [1, 1], "int64", name="numb_copy"
+                        ),
+                        "find_energy": np.float32(1.0),
+                        "energy": static.InputSpec([1, 1], "float64", name="energy"),
+                        "find_force": np.float32(1.0),
+                        "force": static.InputSpec([1, -1, 3], "float64", name="force"),
+                        "natoms": static.InputSpec([1, -1], "int32", name="natoms"),
+                    },  # label
+                    # Trailing forward() arguments are omitted from input_spec:
+                    #   task_key=None, inference_only=False,
+                    #   do_atomic_virial=False, fparam=None, aparam=None
+                ],
+                full_graph=True,
+            )(self.wrapper.forward)
+
+            log.info(
+                "Enable CINN during training, there may be some additional "
+                "compilation time in the first training step."
+            )
+
         if dist.is_available() and dist.is_initialized():
             # DDP will guarantee the model parameters are identical across all processes
             self.wrapper = fleet.distributed_model(
@@ -631,20 +674,6 @@ def warm_up_linear(step, warmup_steps):
         self.profiling_file = training_params.get("profiling_file", "timeline.json")
 
     def run(self) -> None:
-        if CINN:
-            from paddle import (
-                jit,
-            )
-
-            backend = "CINN" if CINN else None
-            self.wrapper.forward = jit.to_static(full_graph=True, backend=backend)(
-                self.wrapper.forward
-            )
-            log.info(
-                "Enable CINN during training, there may be some additional "
-                "compilation time in the first traning step."
-            )
-
         fout = (
             open(
                 self.disp_file,
diff --git a/source/api_cc/src/DeepPotPD.cc b/source/api_cc/src/DeepPotPD.cc
index c5f9391ca9..3a3d880c4b 100644
--- a/source/api_cc/src/DeepPotPD.cc
+++ b/source/api_cc/src/DeepPotPD.cc
@@ -120,6 +120,24 @@ void DeepPotPD::init(const std::string& model,
     std::cout << "load model from: " << model << " to gpu:" << gpu_id
               << std::endl;
   }
+  // Report whether model.forward and model.forward_lower will be compiled
+  // with CINN. The enabling hint is identical for both: print it at most once.
+  const bool cinn_on_forward = config->cinn_enabled();
+  const bool cinn_on_forward_lower = config_fl->cinn_enabled();
+  if (cinn_on_forward) {
+    std::cout << "model.forward will be compiled with cinn." << std::endl;
+  }
+  if (cinn_on_forward_lower) {
+    std::cout << "model.forward_lower will be compiled with cinn." << std::endl;
+  }
+  // At least one of the two configs has CINN disabled: show how to enable it.
+  if (!cinn_on_forward || !cinn_on_forward_lower) {
+    std::cout << "NOTE: You can try: \n'export FLAGS_prim_all=true"
+                 " FLAGS_enable_pir_in_executor=1"
+                 " FLAGS_prim_enable_dynamic=true FLAGS_use_cinn=true'\n"
+                 "to speed up C++ inference with paddle backend"
+              << std::endl;
+  }
 
   // NOTE: Both set to 1 now.
   // get_env_nthreads(num_intra_nthreads,