@@ -51,77 +51,14 @@ def batch_norm(
     # We perform constant folding for batch norm when the weight, bias, running_mean, and running_var are all tensors.
     # Batch norm operation can be fused into a single layer, which is more efficient than the original implementation.
     # In this way, the batch norm layer will be fused with the Convolution layer and get a performance boost.
-    if all(
+    if not all(
         [
             isinstance(weight, torch.Tensor),
             isinstance(bias, torch.Tensor),
             isinstance(running_mean, torch.Tensor),
             isinstance(running_var, torch.Tensor),
         ]
     ):
-        if weight is None:
-            weight = 1.0
-
-        if bias is None:
-            bias = 0.0
-
-        if running_mean is None:
-            running_mean = 0.0
-
-        if running_var is None:
-            running_var = 1.0
-        adjusted_scale = weight / torch.sqrt(running_var + eps)
-        adjusted_bias = bias - running_mean * adjusted_scale
-        power = torch.ones_like(adjusted_scale)
-        adjusted_scale = to_trt_weights(
-            ctx,
-            adjusted_scale,
-            name,
-            layer_type_name="SCALE",
-            weight_type_name="SCALE",
-            target=target,
-            source_ir=source_ir,
-        )
-        adjusted_bias = to_trt_weights(
-            ctx,
-            adjusted_bias,
-            name,
-            layer_type_name="SCALE",
-            weight_type_name="SHIFT",
-            target=target,
-            source_ir=source_ir,
-        )
-
-        power = to_trt_weights(
-            ctx,
-            power,
-            name,
-            layer_type_name="SCALE",
-            weight_type_name="POWER",
-            target=target,
-            source_ir=source_ir,
-        )
-
-        output_shape = input.shape
-        if len(input.shape) < 4:
-
-            new_shape = (
-                (input.shape[0], input.shape[1], 1, 1)
-                if len(input.shape) == 2
-                else (input.shape[0], input.shape[1], input.shape[2], 1)
-            )
-            input = impl.shuffle.reshape(
-                ctx, target, source_ir, f"{name}_reshape_2d", input, new_shape
-            )
-
-        layer = ctx.net.add_scale_nd(
-            input, trt.ScaleMode.CHANNEL, adjusted_bias, adjusted_scale, power, 1
-        )
-        set_layer_name(layer, target, name, source_ir)
-        output = layer.get_output(0)
-
-    else:
-
         # We name the weight here according to the state_dict name
         weight = (
             get_trt_tensor(ctx, 1.0, f"{name}_weight")
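As an aside on the fusion mentioned in the comments at the top of this hunk: once batch norm is reduced to a constant per-channel affine `y = x * scale + shift`, it can be absorbed into a preceding convolution by rescaling the conv weights and bias per output channel. A minimal standalone sketch of that algebra (plain PyTorch with made-up shapes, not part of the converter):

```python
import torch
import torch.nn.functional as F

# Folding a per-channel affine (scale, shift) into the preceding conv:
# conv(x, W, b) * scale + shift == conv(x, scale * W, scale * b + shift)
C_in, C_out = 3, 8
x = torch.randn(1, C_in, 16, 16)
W = torch.randn(C_out, C_in, 3, 3)
b = torch.randn(C_out)
scale = torch.randn(C_out)
shift = torch.randn(C_out)

y_ref = F.conv2d(x, W, b, padding=1) * scale.view(1, -1, 1, 1) + shift.view(1, -1, 1, 1)
y_folded = F.conv2d(x, W * scale.view(-1, 1, 1, 1), b * scale + shift, padding=1)
assert torch.allclose(y_ref, y_folded, atol=1e-4)
```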
@@ -206,6 +143,70 @@ def batch_norm(
             bias_adjusted_reshape,
         )
 
+    else:
+        if weight is None:
+            weight = 1.0
+
+        if bias is None:
+            bias = 0.0
+
+        if running_mean is None:
+            running_mean = 0.0
+
+        if running_var is None:
+            running_var = 1.0
+        adjusted_scale, adjusted_bias = batch_norm_constant_folding(
+            weight, bias, running_mean, running_var, eps
+        )
+        power = torch.ones_like(adjusted_scale)
+
+        adjusted_scale = to_trt_weights(
+            ctx,
+            adjusted_scale,
+            name,
+            layer_type_name="SCALE",
+            weight_type_name="SCALE",
+            target=target,
+            source_ir=source_ir,
+        )
+        adjusted_bias = to_trt_weights(
+            ctx,
+            adjusted_bias,
+            name,
+            layer_type_name="SCALE",
+            weight_type_name="SHIFT",
+            target=target,
+            source_ir=source_ir,
+        )
+
+        power = to_trt_weights(
+            ctx,
+            power,
+            name,
+            layer_type_name="SCALE",
+            weight_type_name="POWER",
+            target=target,
+            source_ir=source_ir,
+        )
+
+        output_shape = input.shape
+        if len(input.shape) < 4:
+
+            new_shape = (
+                (input.shape[0], input.shape[1], 1, 1)
+                if len(input.shape) == 2
+                else (input.shape[0], input.shape[1], input.shape[2], 1)
+            )
+            input = impl.shuffle.reshape(
+                ctx, target, source_ir, f"{name}_reshape_2d", input, new_shape
+            )
+
+        layer = ctx.net.add_scale_nd(
+            input, trt.ScaleMode.CHANNEL, adjusted_bias, adjusted_scale, power, 1
+        )
+        set_layer_name(layer, target, name, source_ir)
+        output = layer.get_output(0)
+
     # For BatchNorm1d, reshape output back to original shape if necessary
     if len(output_shape) < 4:
         output = impl.shuffle.reshape(
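The rank-promotion step in this branch handles BatchNorm1d: rank-2 and rank-3 inputs are padded with trailing unit dimensions to 4-D before `add_scale_nd` is applied, and the output is reshaped back to its original shape afterwards. A small sketch of just that shape logic, using a hypothetical helper name not present in the converter:

```python
from typing import Tuple

def promote_to_4d(shape: Tuple[int, ...]) -> Tuple[int, ...]:
    """Illustrative helper (not part of the converter): pad BatchNorm1d
    input shapes with trailing 1s so a channel-wise scale runs on a 4-D tensor."""
    if len(shape) == 2:                      # (N, C)
        return (shape[0], shape[1], 1, 1)
    if len(shape) == 3:                      # (N, C, L)
        return (shape[0], shape[1], shape[2], 1)
    return shape                             # already rank >= 4

assert promote_to_4d((8, 16)) == (8, 16, 1, 1)
assert promote_to_4d((8, 16, 32)) == (8, 16, 32, 1)
assert promote_to_4d((8, 16, 32, 32)) == (8, 16, 32, 32)
```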
@@ -224,6 +225,18 @@ def batch_norm(
     return output
 
 
+def batch_norm_constant_folding(
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    running_mean: torch.Tensor,
+    running_var: torch.Tensor,
+    eps: float,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    adjusted_scale = weight / torch.sqrt(running_var + eps)
+    adjusted_bias = bias - running_mean * adjusted_scale
+    return adjusted_scale, adjusted_bias
+
+
 def native_layer_norm(
     ctx: ConversionContext,
     target: Target,
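The new `batch_norm_constant_folding` helper is the standard inference-time rewrite of batch norm as a per-channel affine: y = (x - mean) / sqrt(var + eps) * weight + bias = x * scale + shift, with scale = weight / sqrt(var + eps) and shift = bias - mean * scale. A quick standalone check of that identity against `torch.nn.functional.batch_norm` (plain PyTorch, arbitrary shapes, independent of the TensorRT converter):

```python
import torch
import torch.nn.functional as F

def batch_norm_constant_folding(weight, bias, running_mean, running_var, eps):
    # Same algebra as the helper added in this commit.
    adjusted_scale = weight / torch.sqrt(running_var + eps)
    adjusted_bias = bias - running_mean * adjusted_scale
    return adjusted_scale, adjusted_bias

C, eps = 8, 1e-5
x = torch.randn(2, C, 16, 16)
weight, bias = torch.randn(C), torch.randn(C)
running_mean, running_var = torch.randn(C), torch.rand(C) + 0.5

scale, shift = batch_norm_constant_folding(weight, bias, running_mean, running_var, eps)
folded = x * scale.view(1, C, 1, 1) + shift.view(1, C, 1, 1)
reference = F.batch_norm(x, running_mean, running_var, weight, bias, training=False, eps=eps)
assert torch.allclose(folded, reference, atol=1e-4)
```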
@@ -303,7 +316,7 @@ def native_group_norm(
         ctx, target, source_ir, f"{name}_expand_bias_zero", bias_zero, shape
     )
 
-    axes = get_axes_for_reduce_op([i for i in range(1 if group == 1 else 2, rank)])
+    axes = get_axes_for_reduce_op(list(range(1 if group == 1 else 2, rank)))
 
     # INormalizationLayer scales the normalized output per-group, but PyTorch scales the normalized output per-channel,
     # hence causing diverse result. Let TensorRT does no-op for scaling here, and do scaling ourselves later