
Commit 81cea10

addressed comments
Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
1 parent: a32007e

7 files changed (+37, -41 lines)


QEfficient/generation/text_generation_inference.py

Lines changed: 8 additions & 8 deletions
@@ -57,10 +57,10 @@ class CloudAI100ExecInfo:
     perf_metrics: PerfMetrics
 
     def __repr__(self):
-        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\
-        \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\
-        \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\
-        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}"
+        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
+        \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} tokens/sec\
+        \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} tokens/sec\
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} tokens/sec"
 
 
 @dataclass
@@ -70,10 +70,10 @@ class CloudAI100ExecInfoNew:
     perf_metrics: PerfMetrics
 
     def __repr__(self):
-        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\
-        \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\
-        \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\
-        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}"
+        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
+        \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} token/sec\
+        \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} token/sec\
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} sec"
 
 
 io_files = []
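
Note: with the units attached, the CloudAI100ExecInfoNew summary prints along these lines (values are illustrative, not measured):

Average Prefill time a.k.a TTFT is= 0.82 sec
Decode is= 35.41 token/sec
Total is= 33.92 token/sec
Total (E2E) inference time is= 4.71 sec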

QEfficient/transformers/modeling_utils.py

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ def _create_causal_mask(
     """
     A utility attention mask class that allows one to:
     - Create a causal 4d mask
-    - Create a causal 4d mask with slided window
+    - Create a causal 4d mask with sliding window
     """
     if sliding_window is not None:
         query_indices = position_ids.unsqueeze(-1)
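
Note: a sliding-window causal mask keeps the usual rule that a query attends only to earlier positions and additionally drops keys older than the last `sliding_window` positions. A minimal standalone sketch of that rule, not the repository's `_create_causal_mask` implementation (helper name and shapes are illustrative):

import torch

def sliding_window_causal_mask(position_ids, kv_len, sliding_window=None):
    # position_ids: (batch, seq_len) absolute positions of the query tokens
    query_indices = position_ids.unsqueeze(-1)                  # (batch, seq_len, 1)
    key_indices = torch.arange(kv_len).view(1, 1, kv_len)       # (1, 1, kv_len)
    keep = key_indices <= query_indices                         # causal: never attend to the future
    if sliding_window is not None:
        keep &= key_indices > (query_indices - sliding_window)  # only the most recent `sliding_window` keys
    return keep.unsqueeze(1)                                    # (batch, 1, seq_len, kv_len) boolean keep-mask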

QEfficient/transformers/models/internvl/modeling_internvl.py

Lines changed: 13 additions & 13 deletions
@@ -18,13 +18,13 @@ class QEffInternVLModel(nn.Module):
     def get_specializations(
         self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
     ):
-        # TODO: check if this should be named num_crops or something else
-        num_crops = compiler_options.get("num_crops", None)
-        if num_crops is None:
+        # TODO: check if this should be named num_patches or something else
+        num_patches = compiler_options.get("num_patches", None)
+        if num_patches is None:
             logger.warning(
-                "User should pass `num_crops` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13"
+                "User should pass `num_patches` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13"
             )
-            num_crops = 13
+            num_patches = 13
 
         prefill_seq_len = prefill_seq_len if prefill_seq_len else 3840  # 4096-256
         ctx_len = ctx_len if ctx_len else 4096
@@ -39,14 +39,14 @@ def get_specializations(
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "num_crops": num_crops,
+                "num_patches": num_patches,
                 "img_size": img_size,
             },
             {
                 "batch_size": batch_size,
                 "seq_len": "1",
                 "ctx_len": ctx_len,
-                "num_crops": num_crops,
+                "num_patches": num_patches,
                 "img_size": img_size,
             },
         ]
@@ -58,7 +58,7 @@ def get_onnx_dynamic_axes(
         dynamic_axes = {}
         dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
         dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
-        dynamic_axes["pixel_values"] = {0: "num_crops", 2: "img_size", 3: "img_size"}
+        dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}
 
         pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
         for i in range(self.language_model.config.num_hidden_layers):
@@ -79,12 +79,12 @@ def get_output_names(
     def get_dummy_inputs(self, kv_offload: bool = False):
         if kv_offload:
             raise ValueError("kv_offload method not supported for InternVL yet!")
-        NUM_CROPS = 13
+        num_patches = 13
         C = 3
         if vis_cfg := getattr(self.config, "vision_config", None):
-            img_size = getattr(vis_cfg, "image_size", 336)
+            img_size = getattr(vis_cfg, "image_size", 448)
         else:
-            img_size = 336
+            img_size = 448
 
         # Define shapes
         inputs_shapes = {}
@@ -93,7 +93,7 @@ def get_dummy_inputs(self, kv_offload: bool = False):
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
             constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
-        inputs_shapes["pixel_values"] = (NUM_CROPS, C, img_size, img_size)
+        inputs_shapes["pixel_values"] = (num_patches, C, img_size, img_size)
 
         # Define inputs
         inputs = {}
@@ -143,7 +143,7 @@ def get_inputs_info(self):
         return [
             IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
             IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")),
-            IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_crops", 3, "img_size", "img_size")),
+            IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_patches", 3, "img_size", "img_size")),
         ]
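
Note: because `num_patches` is read from `**compiler_options`, callers pin the dynamic axis of `pixel_values` by passing it at compile time. A hypothetical usage sketch; the entry-point class, checkpoint name, and argument values are illustrative rather than taken from this commit:

from QEfficient import QEFFAutoModelForImageTextToText

model = QEFFAutoModelForImageTextToText.from_pretrained("OpenGVLab/InternVL2_5-1B")  # illustrative checkpoint
model.compile(
    prefill_seq_len=3840,
    ctx_len=4096,
    img_size=448,
    num_patches=13,  # forwarded via **compiler_options; if omitted, a warning is logged and 13 is assumed
)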

QEfficient/transformers/models/llava/modeling_llava.py

Lines changed: 0 additions & 1 deletion
@@ -78,7 +78,6 @@ def get_dummy_inputs(self, **kwargs):
     def get_specializations(
         self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
     ):
-        # TODO: check if this should be named num_crops or something else
         max_num_images = compiler_options.get("max_num_images", 1)
         prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
         ctx_len = ctx_len if ctx_len else CTX_LEN

QEfficient/transformers/models/mllama/modeling_mllama.py

Lines changed: 7 additions & 10 deletions
@@ -44,11 +44,9 @@
     _prepare_aspect_ratio_attention_mask,
     _prepare_cross_attention_mask,
 )
+from QEfficient.utils import constants
 from QEfficient.utils._utils import IOInfo
 
-CTX_LEN = 128
-SEQ_LEN = 32
-BS = 1
 MAX_NUM_IMG = 1
 NUM_CHANNEL = 3
 
@@ -388,9 +386,6 @@ def forward(
         if attention_mask is not None:  # no matter the length, we just slice it
             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
             attn_weights = attn_weights + causal_mask
-            # attn_weights = torch.where(
-            #     attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights
-            # )
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
         attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
@@ -1119,6 +1114,10 @@ def forward(
         return outputs
 
     def get_dummy_inputs(self, kv_offload: bool = False):
+        BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
+        SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
+        CTX_LEN = constants.ONNX_EXPORT_CTX_LEN
+
         txt_cfg = self.config.get_text_config()
         num_hidden_layers = txt_cfg.num_hidden_layers
         cross_attention_layers = txt_cfg.cross_attention_layers
@@ -1192,11 +1191,9 @@ def get_specializations(
         **compiler_options,
     ):
         vis_cfg = self.config.vision_config
-
-        # TODO: check if this should be named num_crops or something else
         max_num_images = compiler_options.get("max_num_images", 1)
-        prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
-        ctx_len = ctx_len if ctx_len else CTX_LEN
+        prefill_seq_len = prefill_seq_len if prefill_seq_len else 32
+        ctx_len = ctx_len if ctx_len else 128
         if img_size is None and hasattr(vis_cfg, "image_size"):
             img_size = getattr(vis_cfg, "image_size")
         elif img_size is None:
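
Note: the export-time example dimensions now come from the shared constants module instead of per-file literals. A rough illustrative sketch of the pattern (the removed module-level values were BS = 1, SEQ_LEN = 32, CTX_LEN = 128):

from QEfficient.utils import constants

# Illustrative only: every model now derives its ONNX-export dummy shapes from one place.
bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
ctx_len = constants.ONNX_EXPORT_CTX_LEN
dummy_input_ids_shape = (bs, seq_len)  # what get_dummy_inputs would use for input_ids
print(bs, seq_len, ctx_len, dummy_input_ids_shape)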

QEfficient/transformers/models/modeling_auto.py

Lines changed: 6 additions & 6 deletions
@@ -33,8 +33,8 @@
     KVCacheModuleMethodMapperTransform,
     KVCacheTransform,
     SpDTransform,
-    VlmKVOffloadTransorm,
-    VlmNoKVOffloadTransorm,
+    VlmKVOffloadTransform,
+    VlmNoKVOffloadTransform,
 )
 from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
 from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform
@@ -401,7 +401,7 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
         GPTQToMatmulNbitsTransform,
         CustomOpsTransform,
         KVCacheTransform,
-        VlmKVOffloadTransorm,
+        VlmKVOffloadTransform,
     ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
@@ -454,7 +454,7 @@ def model_name(self) -> str:
         return mname
 
 
-class _QEffAutoModelForImageTextToText2QPC:
+class _QEffAutoModelForImageTextToTextDualQPC:
     UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"]
 
     def __init__(
@@ -788,7 +788,7 @@ class _QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase):
         CustomOpsTransform,
         KVCacheTransform,
         KVCacheModuleMethodMapperTransform,
-        VlmNoKVOffloadTransorm,
+        VlmNoKVOffloadTransform,
     ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
@@ -1128,7 +1128,7 @@ def __new__(self, model: nn.Module, kv_offload=False, **kwargs):
             logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}")
 
         if kv_offload:
-            return _QEffAutoModelForImageTextToText2QPC(model, **kwargs)
+            return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs)
         else:
            return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs)
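
Note: the factory still selects the implementation from kv_offload; only the class name changes. A hedged usage sketch, assuming the public QEFFAutoModelForImageTextToText entry point forwards kv_offload to the __new__ shown above (checkpoint name is illustrative):

from QEfficient import QEFFAutoModelForImageTextToText

# kv_offload=True selects the dual-QPC path (_QEffAutoModelForImageTextToTextDualQPC);
# kv_offload=False selects the single-QPC path (_QEFFAutoModelForImageTextToText1QPC).
model = QEFFAutoModelForImageTextToText.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct",  # illustrative checkpoint
    kv_offload=True,
)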

QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 2 additions & 2 deletions
@@ -365,15 +365,15 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
         return model, transformed
 
 
-class VlmKVOffloadTransorm(ModuleMappingTransform):
+class VlmKVOffloadTransform(ModuleMappingTransform):
     # supported architectures
     _module_mapping = {
         # Llama
         MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC,
     }
 
 
-class VlmNoKVOffloadTransorm(ModuleMappingTransform):
+class VlmNoKVOffloadTransform(ModuleMappingTransform):
     # supported architectures
     _module_mapping = {
         # Llama
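
Note: both renamed classes are ModuleMappingTransforms, which swap an eligible module's class for its QEfficient counterpart. A simplified sketch of that mechanism, not the library's actual implementation:

import torch.nn as nn

class ModuleMappingTransformSketch:
    # {original module class: replacement class}; subclasses fill this in,
    # e.g. {MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC}.
    _module_mapping = {}

    @classmethod
    def apply(cls, model: nn.Module):
        transformed = False
        for module in model.modules():
            replacement = cls._module_mapping.get(type(module))
            if replacement is not None:
                # Re-point the instance's class so its forward() comes from the replacement.
                module.__class__ = replacement
                transformed = True
        return model, transformed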

0 commit comments
