diff --git a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py
index c9a39bbc53b5..20ecd64052ec 100644
--- a/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py
+++ b/deepspeed/runtime/data_pipeline/data_sampling/variable_batch_size_and_lr.py
@@ -6,8 +6,8 @@
 # support/questions/maintenance: github user @brunomaga or @deepspeedai/deepspeed
 
 import random
-import torch
 import os
+import math
 import numpy as np
 from torch.optim.lr_scheduler import LRScheduler
 from torch.optim.optimizer import Optimizer
@@ -156,7 +156,7 @@ def scale_lr(base_batch_size, batch_size, base_lr=1, method="linear"):
         # Square Root scaling: "when multiplying the batch size by k, multiply the learning rate
         # by √k, to keep the variance in the gradient expectation constant"
         # (A. Krizhevsky. One weird trick for parallelizing convolutional neural networks)
-        return base_lr * torch.sqrt(batch_size / base_batch_size)
+        return base_lr * math.sqrt(batch_size / base_batch_size)
     elif method == None or method.upper() == "NONE":
         return base_lr
     raise ValueError("Unknown scaling method: {}".format(method))