
Commit 81cea10

addressed comments
Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
1 parent: a32007e

7 files changed (+37, -41 lines)


QEfficient/generation/text_generation_inference.py

Lines changed: 8 additions & 8 deletions
@@ -57,10 +57,10 @@ class CloudAI100ExecInfo:
     perf_metrics: PerfMetrics
 
     def __repr__(self):
-        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\
-        \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\
-        \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\
-        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}"
+        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
+        \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} tokens/sec\
+        \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} tokens/sec\
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} tokens/sec"
 
 
 @dataclass
@@ -70,10 +70,10 @@ class CloudAI100ExecInfoNew:
     perf_metrics: PerfMetrics
 
     def __repr__(self):
-        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\
-        \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\
-        \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\
-        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}"
+        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
+        \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} token/sec\
+        \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} token/sec\
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} sec"
 
 
 io_files = []
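
Note: with the units attached, the CloudAI100ExecInfoNew summary prints along these lines (values are illustrative, not measured):

Average Prefill time a.k.a TTFT is= 0.82 sec
Decode is= 35.41 token/sec
Total is= 33.92 token/sec
Total (E2E) inference time is= 4.71 sec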

QEfficient/transformers/modeling_utils.py

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ def _create_causal_mask(
     """
     A utility attention mask class that allows one to:
     - Create a causal 4d mask
-    - Create a causal 4d mask with slided window
+    - Create a causal 4d mask with sliding window
     """
     if sliding_window is not None:
         query_indices = position_ids.unsqueeze(-1)
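
Note: a sliding-window causal mask keeps the usual rule that a query attends only to earlier positions and additionally drops keys older than the last `sliding_window` positions. A minimal standalone sketch of that rule, not the repository's `_create_causal_mask` implementation (helper name and shapes are illustrative):

import torch

def sliding_window_causal_mask(position_ids, kv_len, sliding_window=None):
    # position_ids: (batch, seq_len) absolute positions of the query tokens
    query_indices = position_ids.unsqueeze(-1)                  # (batch, seq_len, 1)
    key_indices = torch.arange(kv_len).view(1, 1, kv_len)       # (1, 1, kv_len)
    keep = key_indices <= query_indices                         # causal: never attend to the future
    if sliding_window is not None:
        keep &= key_indices > (query_indices - sliding_window)  # only the most recent `sliding_window` keys
    return keep.unsqueeze(1)                                    # (batch, 1, seq_len, kv_len) boolean keep-mask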

QEfficient/transformers/models/internvl/modeling_internvl.py

Lines changed: 13 additions & 13 deletions
@@ -18,13 +18,13 @@ class QEffInternVLModel(nn.Module):
     def get_specializations(
         self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
     ):
-        # TODO: check if this should be named num_crops or something else
-        num_crops = compiler_options.get("num_crops", None)
-        if num_crops is None:
+        # TODO: check if this should be named num_patches or something else
+        num_patches = compiler_options.get("num_patches", None)
+        if num_patches is None:
             logger.warning(
-                "User should pass `num_crops` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13"
+                "User should pass `num_patches` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13"
             )
-            num_crops = 13
+            num_patches = 13
 
         prefill_seq_len = prefill_seq_len if prefill_seq_len else 3840  # 4096-256
         ctx_len = ctx_len if ctx_len else 4096
@@ -39,14 +39,14 @@ def get_specializations(
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "num_crops": num_crops,
+                "num_patches": num_patches,
                 "img_size": img_size,
             },
             {
                 "batch_size": batch_size,
                 "seq_len": "1",
                 "ctx_len": ctx_len,
-                "num_crops": num_crops,
+                "num_patches": num_patches,
                 "img_size": img_size,
             },
         ]
@@ -58,7 +58,7 @@ def get_onnx_dynamic_axes(
         dynamic_axes = {}
         dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
         dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
-        dynamic_axes["pixel_values"] = {0: "num_crops", 2: "img_size", 3: "img_size"}
+        dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}
 
         pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
         for i in range(self.language_model.config.num_hidden_layers):
@@ -79,12 +79,12 @@ def get_output_names(
     def get_dummy_inputs(self, kv_offload: bool = False):
         if kv_offload:
             raise ValueError("kv_offload method not supported for InternVL yet!")
-        NUM_CROPS = 13
+        num_patches = 13
         C = 3
         if vis_cfg := getattr(self.config, "vision_config", None):
-            img_size = getattr(vis_cfg, "image_size", 336)
+            img_size = getattr(vis_cfg, "image_size", 448)
         else:
-            img_size = 336
+            img_size = 448
 
         # Define shapes
         inputs_shapes = {}
@@ -93,7 +93,7 @@ def get_dummy_inputs(self, kv_offload: bool = False):
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
             constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
-        inputs_shapes["pixel_values"] = (NUM_CROPS, C, img_size, img_size)
+        inputs_shapes["pixel_values"] = (num_patches, C, img_size, img_size)
 
         # Define inputs
         inputs = {}
@@ -143,7 +143,7 @@ def get_inputs_info(self):
         return [
             IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
             IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")),
-            IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_crops", 3, "img_size", "img_size")),
+            IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_patches", 3, "img_size", "img_size")),
         ]
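
Note: because `num_patches` is read from `**compiler_options`, callers pin the dynamic axis of `pixel_values` by passing it at compile time. A hypothetical usage sketch; the entry-point class, checkpoint name, and argument values are illustrative rather than taken from this commit:

from QEfficient import QEFFAutoModelForImageTextToText

model = QEFFAutoModelForImageTextToText.from_pretrained("OpenGVLab/InternVL2_5-1B")  # illustrative checkpoint
model.compile(
    prefill_seq_len=3840,
    ctx_len=4096,
    img_size=448,
    num_patches=13,  # forwarded via **compiler_options; if omitted, a warning is logged and 13 is assumed
)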

QEfficient/transformers/models/llava/modeling_llava.py

Lines changed: 0 additions & 1 deletion
@@ -78,7 +78,6 @@ def get_dummy_inputs(self, **kwargs):
     def get_specializations(
         self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
     ):
-        # TODO: check if this should be named num_crops or something else
         max_num_images = compiler_options.get("max_num_images", 1)
         prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
         ctx_len = ctx_len if ctx_len else CTX_LEN

QEfficient/transformers/models/mllama/modeling_mllama.py

Lines changed: 7 additions & 10 deletions
@@ -44,11 +44,9 @@
     _prepare_aspect_ratio_attention_mask,
     _prepare_cross_attention_mask,
 )
+from QEfficient.utils import constants
 from QEfficient.utils._utils import IOInfo
 
-CTX_LEN = 128
-SEQ_LEN = 32
-BS = 1
 MAX_NUM_IMG = 1
 NUM_CHANNEL = 3
 
@@ -388,9 +386,6 @@ def forward(
         if attention_mask is not None:  # no matter the length, we just slice it
             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
             attn_weights = attn_weights + causal_mask
-            # attn_weights = torch.where(
-            #     attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights
-            # )
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
         attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
@@ -1119,6 +1114,10 @@ def forward(
         return outputs
 
     def get_dummy_inputs(self, kv_offload: bool = False):
+        BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
+        SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
+        CTX_LEN = constants.ONNX_EXPORT_CTX_LEN
+
         txt_cfg = self.config.get_text_config()
         num_hidden_layers = txt_cfg.num_hidden_layers
         cross_attention_layers = txt_cfg.cross_attention_layers
@@ -1192,11 +1191,9 @@ def get_specializations(
         **compiler_options,
     ):
         vis_cfg = self.config.vision_config
-
-        # TODO: check if this should be named num_crops or something else
         max_num_images = compiler_options.get("max_num_images", 1)
-        prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
-        ctx_len = ctx_len if ctx_len else CTX_LEN
+        prefill_seq_len = prefill_seq_len if prefill_seq_len else 32
+        ctx_len = ctx_len if ctx_len else 128
         if img_size is None and hasattr(vis_cfg, "image_size"):
             img_size = getattr(vis_cfg, "image_size")
         elif img_size is None:
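
Note: the export-time example dimensions now come from the shared constants module instead of per-file literals. A rough illustrative sketch of the pattern (the removed module-level values were BS = 1, SEQ_LEN = 32, CTX_LEN = 128):

from QEfficient.utils import constants

# Illustrative only: every model now derives its ONNX-export dummy shapes from one place.
bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
ctx_len = constants.ONNX_EXPORT_CTX_LEN
dummy_input_ids_shape = (bs, seq_len)  # what get_dummy_inputs would use for input_ids
print(bs, seq_len, ctx_len, dummy_input_ids_shape)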

QEfficient/transformers/models/modeling_auto.py

Lines changed: 6 additions & 6 deletions
@@ -33,8 +33,8 @@
     KVCacheModuleMethodMapperTransform,
     KVCacheTransform,
     SpDTransform,
-    VlmKVOffloadTransorm,
-    VlmNoKVOffloadTransorm,
+    VlmKVOffloadTransform,
+    VlmNoKVOffloadTransform,
 )
 from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
 from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform
@@ -401,7 +401,7 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
         GPTQToMatmulNbitsTransform,
         CustomOpsTransform,
         KVCacheTransform,
-        VlmKVOffloadTransorm,
+        VlmKVOffloadTransform,
     ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
@@ -454,7 +454,7 @@ def model_name(self) -> str:
         return mname
 
 
-class _QEffAutoModelForImageTextToText2QPC:
+class _QEffAutoModelForImageTextToTextDualQPC:
     UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"]
 
     def __init__(
@@ -788,7 +788,7 @@ class _QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase):
         CustomOpsTransform,
         KVCacheTransform,
         KVCacheModuleMethodMapperTransform,
-        VlmNoKVOffloadTransorm,
+        VlmNoKVOffloadTransform,
     ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
@@ -1128,7 +1128,7 @@ def __new__(self, model: nn.Module, kv_offload=False, **kwargs):
             logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}")
 
         if kv_offload:
-            return _QEffAutoModelForImageTextToText2QPC(model, **kwargs)
+            return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs)
         else:
            return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs)
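
Note: the factory still selects the implementation from kv_offload; only the class name changes. A hedged usage sketch, assuming the public QEFFAutoModelForImageTextToText entry point forwards kv_offload to the __new__ shown above (checkpoint name is illustrative):

from QEfficient import QEFFAutoModelForImageTextToText

# kv_offload=True selects the dual-QPC path (_QEffAutoModelForImageTextToTextDualQPC);
# kv_offload=False selects the single-QPC path (_QEFFAutoModelForImageTextToText1QPC).
model = QEFFAutoModelForImageTextToText.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct",  # illustrative checkpoint
    kv_offload=True,
)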

QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 2 additions & 2 deletions
@@ -365,15 +365,15 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
         return model, transformed
 
 
-class VlmKVOffloadTransorm(ModuleMappingTransform):
+class VlmKVOffloadTransform(ModuleMappingTransform):
     # supported architectures
     _module_mapping = {
         # Llama
         MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC,
     }
 
 
-class VlmNoKVOffloadTransorm(ModuleMappingTransform):
+class VlmNoKVOffloadTransform(ModuleMappingTransform):
     # supported architectures
     _module_mapping = {
         # Llama
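
Note: both renamed classes are ModuleMappingTransforms, which swap an eligible module's class for its QEfficient counterpart. A simplified sketch of that mechanism, not the library's actual implementation:

import torch.nn as nn

class ModuleMappingTransformSketch:
    # {original module class: replacement class}; subclasses fill this in,
    # e.g. {MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC}.
    _module_mapping = {}

    @classmethod
    def apply(cls, model: nn.Module):
        transformed = False
        for module in model.modules():
            replacement = cls._module_mapping.get(type(module))
            if replacement is not None:
                # Re-point the instance's class so its forward() comes from the replacement.
                module.__class__ = replacement
                transformed = True
        return model, transformed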

0 commit comments
