
Commit abd04e4

added unit tests for implicit passing of num_logits_to_keep
Signed-off-by: eplatero <quic_eplatero@quicinc.com>
1 parent d483356 commit abd04e4
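
Alongside the new tests, the diff below renames the user-facing kwarg from num_logits_to_keep to num_speculative_tokens; num_logits_to_keep (num_speculative_tokens + 1) is now derived implicitly at export time rather than passed by callers. A hedged usage sketch, assuming the QEFFAutoModelForCausalLM entry points behave as shown in the hunks below; the model card and the value 3 are illustrative only:

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

# Illustrative model card; any supported causal LM would do.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    num_speculative_tokens=3,  # stored on the instance; no NUM_LOGITS_TO_KEEP constant involved
)

# transform() applies SpDTransform when num_speculative_tokens is set (and rejects
# combining it with is_dlm); export() later sets model.num_logits_to_keep to
# num_speculative_tokens + 1 without the caller ever passing it explicitly.
qeff_model.transform(num_speculative_tokens=3)
onnx_path = qeff_model.export()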

10 files changed: +80 -114 lines changed


QEfficient/compile/compile_helper.py

Lines changed: 3 additions & 4 deletions
@@ -12,7 +12,6 @@
 from typing import List, Optional, Tuple
 
 from QEfficient.utils.logging_utils import logger
-from QEfficient.utils.constants import NUM_LOGITS_TO_KEEP
 
 
 def create_and_dump_specializations(
@@ -22,10 +21,10 @@ def create_and_dump_specializations(
     path: str,
     is_dlm: bool,
     full_batch_size: Optional[int] = None,
-    num_logits_to_keep: Optional[int] = NUM_LOGITS_TO_KEEP,
+    num_speculative_tokens: Optional[int] = None,
 ):
     # Create specialization cfgs
-    decode_seq_len = 1 if num_logits_to_keep is None else num_logits_to_keep+1
+    decode_seq_len = 1 if num_speculative_tokens is None else num_speculative_tokens+1
     specialization_cfgs = [
         dict(batch_size=str(batch_size), seq_len=str(prompt_len), ctx_len=str(ctx_len)), # prefill
         dict(batch_size=str(batch_size), seq_len=str(decode_seq_len), ctx_len=str(ctx_len)) # decode
@@ -171,7 +170,7 @@ def compile(
         path=specialization_json_path,
         full_batch_size=full_batch_size,
         is_dlm=kwargs.get("is_dlm", False),
-        num_logits_to_keep=kwargs.get("num_logits_to_keep", NUM_LOGITS_TO_KEEP),
+        num_speculative_tokens=kwargs.get("num_speculative_tokens", None),
     )
 
     # Select the customIO config based on the mx flag.
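
For context, a minimal sketch of the specialization shapes this change produces, assuming a TLM exported with num_speculative_tokens=3 (the decode specialization must then admit num_speculative_tokens + 1 = 4 input tokens per step); the helper name and the concrete batch/prompt/ctx values below are illustrative, not part of the commit:

from typing import Optional

def build_specializations(batch_size: int, prompt_len: int, ctx_len: int,
                          num_speculative_tokens: Optional[int] = None):
    # Mirrors create_and_dump_specializations: decode seq_len is 1 for a plain
    # causal LM and num_speculative_tokens + 1 for a TLM SpD model.
    decode_seq_len = 1 if num_speculative_tokens is None else num_speculative_tokens + 1
    return [
        dict(batch_size=str(batch_size), seq_len=str(prompt_len), ctx_len=str(ctx_len)),     # prefill
        dict(batch_size=str(batch_size), seq_len=str(decode_seq_len), ctx_len=str(ctx_len)), # decode
    ]

# Illustrative values only.
print(build_specializations(batch_size=1, prompt_len=32, ctx_len=128, num_speculative_tokens=3))
# [{'batch_size': '1', 'seq_len': '32', 'ctx_len': '128'},
#  {'batch_size': '1', 'seq_len': '4', 'ctx_len': '128'}]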

QEfficient/exporter/export_hf_to_cloud_ai_100.py

Lines changed: 16 additions & 14 deletions
@@ -21,7 +21,7 @@
 from QEfficient.transformers.modeling_utils import get_lists_of_cb_qeff_models
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
 from QEfficient.utils import load_hf_tokenizer
-from QEfficient.utils.constants import QEFF_MODELS_DIR, NUM_LOGITS_TO_KEEP, Constants
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.generate_inputs import InputHandler
 from QEfficient.utils.logging_utils import logger
 
@@ -196,7 +196,7 @@ def export_kvstyle_transformed_model_to_onnx(
     onnx_dir_path: str,
     seq_len: int,
     full_batch_size: Optional[int] = None,
-    num_logits_to_keep: Optional[int] = NUM_LOGITS_TO_KEEP,
+    num_speculative_tokens: Optional[int] = None,
 ) -> str:
     # Disabling requires_grad on all parameters
     for _, p in enumerate(transformed_model.parameters()):
@@ -205,13 +205,15 @@ def export_kvstyle_transformed_model_to_onnx(
     if seq_len <= 0:
         raise ValueError(f"Need seq_len to be greater than zero, got seq_len={seq_len}")
 
-    # Implicitly pass "num_logits_to_keep" if defined and \
-    # assert prompt_len >= num_logits_to_keep
+    # Implicitly pass "num_speculative_tokens" if defined and \
+    # assert prompt_len >= num_speculative_tokens
     prompt_len = Constants.PROMPT_LEN
-    if num_logits_to_keep is not None:
-        setattr(transformed_model, "num_logits_to_keep", num_logits_to_keep+1)
-        if prompt_len < num_logits_to_keep+1:
-            prompt_len *= math.ceil((num_logits_to_keep+1) / prompt_len)
+    num_logits_to_keep = None
+    if num_speculative_tokens is not None:
+        num_logits_to_keep = num_speculative_tokens+1
+        setattr(transformed_model, "num_logits_to_keep", num_logits_to_keep)
+        if prompt_len < num_logits_to_keep:
+            prompt_len *= math.ceil((num_logits_to_keep) / prompt_len)
 
     # Preprocess inputs
     # Build inputs for prefill
@@ -331,7 +333,7 @@ def export_for_cloud(
     onnx_dir_path: str,
     seq_length: int = Constants.SEQ_LEN,
     full_batch_size: Optional[int] = None,
-    num_logits_to_keep: Optional[int] = NUM_LOGITS_TO_KEEP,
+    num_speculative_tokens: Optional[int] = None,
 ) -> str:
     # Check if model architecture is supported for continuous batching.
     if full_batch_size and qeff_model.model.config.architectures[0] not in get_lists_of_cb_qeff_models.architectures:
@@ -348,7 +350,7 @@ def export_for_cloud(
            onnx_dir_path=onnx_dir_path,
            seq_length=seq_length,
            full_batch_size=full_batch_size,
-           num_logits_to_keep=num_logits_to_keep
+           num_speculative_tokens=num_speculative_tokens
        )
    else:
        raise NotImplementedError(
@@ -363,7 +365,7 @@ def export_lm_model_for_cloud(
     onnx_dir_path: str,
     seq_length: int,
     full_batch_size: Optional[int] = None,
-    num_logits_to_keep: Optional[int] = NUM_LOGITS_TO_KEEP,
+    num_speculative_tokens: Optional[int] = None,
 ) -> str:
     if os.path.exists(onnx_dir_path):
         logger.warning(f"Overriding {onnx_dir_path}")
@@ -377,7 +379,7 @@ def export_lm_model_for_cloud(
            onnx_dir_path=onnx_dir_path,
           seq_len=seq_length,
           full_batch_size=full_batch_size,
-          num_logits_to_keep=num_logits_to_keep,
+          num_speculative_tokens=num_speculative_tokens,
       ) # type: ignore
 
   else:
@@ -403,7 +405,7 @@ def qualcomm_efficient_converter(
     kv: bool = True,
     form_factor: str = "cloud",
     full_batch_size: Optional[int] = None,
-    num_logits_to_keep: Optional[int] = NUM_LOGITS_TO_KEEP,
+    num_speculative_tokens: Optional[int] = None,
 ) -> Tuple[str, str]:
     """
     This method is an alias for ``QEfficient.export``.
@@ -484,7 +486,7 @@ def qualcomm_efficient_converter(
            onnx_dir_path=onnx_dir_path,
            seq_length=seq_length,
            full_batch_size=full_batch_size,
-           num_logits_to_keep=num_logits_to_keep,
+           num_speculative_tokens=num_speculative_tokens,
        )
        return onnx_dir_path, generated_onnx_model_path
    else:
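
The export hunk above only stretches prompt_len when the number of logits to keep (num_speculative_tokens + 1) does not fit into the default prefill length; a small worked sketch of that rounding, where the value 32 standing in for Constants.PROMPT_LEN is an assumption for illustration:

import math

def adjust_prompt_len(prompt_len, num_speculative_tokens):
    # Mirrors export_kvstyle_transformed_model_to_onnx: keep
    # num_speculative_tokens + 1 logits and grow prompt_len in whole
    # multiples of itself until those logits fit.
    if num_speculative_tokens is None:
        return prompt_len
    num_logits_to_keep = num_speculative_tokens + 1
    if prompt_len < num_logits_to_keep:
        prompt_len *= math.ceil(num_logits_to_keep / prompt_len)
    return prompt_len

print(adjust_prompt_len(32, num_speculative_tokens=3))   # 32 (5 logits already fit)
print(adjust_prompt_len(32, num_speculative_tokens=63))  # 64 (needs 64 logits -> 2 * 32)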

QEfficient/exporter/export_utils.py

Lines changed: 0 additions & 3 deletions
@@ -281,9 +281,6 @@ def generate_input_files(
     # inputFiles
     os.makedirs(input_files_path, exist_ok=True)
     filenames = []
-    if "num_logits_to_keep" in input_names:
-        idx = input_names.index("num_logits_to_keep")
-        del input_names[idx]
 
     for name in input_names:
         # We can't directly iterate with inputs.items() because

QEfficient/transformers/modeling_spd_utils.py

Lines changed: 0 additions & 24 deletions
@@ -33,30 +33,6 @@ def filter_hidden_states(
     if num_logits_to_keep is None:
         # return the last logit
         return hidden_states[batch_indices.view(-1, 1), logit_index]
-    # last valid `num_logits_to_keep` need to be computed
-
-    #upper_idx = torch.max(logit_index[0]+1, torch.tensor([num_logits], dtype=torch.int32))
-    #upper_idx = logit_index[0]+1
-    #lower_idx = upper_idx - num_logits
-    #return hidden_states[:, lower_idx:upper_idx] # fails
-    #return hidden_states[:, lower_idx.item():upper_idx.item()] # works
-    #return hidden_states[:, lower_idx:upper_idx]
-    #return hidden_states[batch_indices.view(-1,1), lower_idx:upper_idx]
-    #return hidden_states[batch_indices.view(-1), lower_idx:upper_idx] # fails: Slice
-    #return hidden_states[:, lower_idx.unsqueeze(0):upper_idx.unsqueeze(0)] # fails
-
-    # range operator approach (onnx pass, compile fail)
-    #indices = torch.arange(lower_idx[0], upper_idx[0])
-    #return hidden_states[batch_indices.view(-1,1), indices] # onnx pass, compile fail with: [Operator-'/Range_1'] : Range: Non-constant start tensor not supported.
-
-    # range operators approach v2 (onnx pass, compile fails)
-    #indices = torch.arange(lower_idx[0], upper_idx[0]).repeat(batch_size,1)
-    #return hidden_states[batch_indices.view(-1,1), indices] # onnx pass, compile fail with: Error message: [Operator-'/Range_1'] : Range: Non-constant start tensor not supported.
-
-    # what if we repeat batch_indices to have 1-1 dimensions? (onnx pass, compile fail)
-    #indices = torch.arange(lower_idx[0], upper_idx[0]).repeat(batch_size,1)
-    #return hidden_states[batch_indices.view(-1,1).repeat(1,num_logits), indices] # onnx pass, compile fail with: [Operator-'/Range_1'] : Range: Non-constant start tensor not supported
-
     # topk approach
     topk_indices = torch.topk(position_ids, k=num_logits_to_keep, dim=1).indices.to(torch.int32)
     topk_indices = torch.flip(topk_indices, dims=[1]) # "left" padded input in case num_non_padded_tokens < num_logits_to_keep
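
This hunk drops the dead experiments (the slice- and Range-based variants that failed AIC compilation) and keeps only the topk approach: take the k largest position_ids, which are the last valid tokens regardless of padding, and flip them back into ascending order. A minimal sketch on dummy tensors; the final indexing line is an assumption about how filter_hidden_states continues, since its tail is outside this hunk:

import torch

def gather_last_k_hidden_states(hidden_states, position_ids, num_logits_to_keep):
    # hidden_states: (batch, seq_len, hidden), position_ids: (batch, seq_len)
    batch_indices = torch.arange(position_ids.shape[0])
    # Indices of the k largest position ids == the last valid (non-padded) tokens.
    topk_indices = torch.topk(position_ids, k=num_logits_to_keep, dim=1).indices
    # topk returns descending order; flip so the kept tokens read left to right.
    topk_indices = torch.flip(topk_indices, dims=[1])
    # Assumed continuation: per-batch advanced indexing over the kept positions.
    return hidden_states[batch_indices.view(-1, 1), topk_indices]

hs = torch.randn(2, 8, 16)
pos = torch.arange(8).repeat(2, 1)  # positions 0..7, no padding
print(gather_last_k_hidden_states(hs, pos, num_logits_to_keep=3).shape)  # torch.Size([2, 3, 16])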

QEfficient/transformers/models/llama/modeling_llama.py

Lines changed: 2 additions & 3 deletions
@@ -31,7 +31,6 @@
 )
 
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
-from QEfficient.transformers.modeling_spd_utils import filter_hidden_states
 
 
 class QEffLlamaRotaryEmbedding(LlamaRotaryEmbedding):
@@ -241,7 +240,6 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        num_logits_to_keep: Optional[int] = 0,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
         Args:
@@ -290,7 +288,8 @@ def forward(
         )
 
         # Cast to INT32 to avoid issue while running in ONNXRT
-        hidden_states = filter_hidden_states(outputs[0], position_ids, num_logits_to_keep)
+        logit_index = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        hidden_states = outputs[0][torch.arange(position_ids.shape[0]).view(-1, 1), logit_index]
         if self.config.pretraining_tp > 1:
             lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
             logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
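
With the SpD-specific filter_hidden_states call removed, the base Llama forward reverts to gathering only the hidden state at each sequence's last valid position (the argmax of position_ids); the TLM path keeps the multi-logit filtering via SpDTransform instead. A tiny sketch of that gather on dummy tensors:

import torch

hidden = torch.randn(2, 8, 16)                        # (batch, seq_len, hidden)
position_ids = torch.tensor([[0, 1, 2, 3, 4, 0, 0, 0],
                             [0, 1, 2, 3, 4, 5, 6, 7]])

# Index of the last valid (largest position id) token per sequence.
logit_index = position_ids.to(torch.int32).argmax(1, keepdim=True)  # (batch, 1)
last_hidden = hidden[torch.arange(position_ids.shape[0]).view(-1, 1), logit_index]
print(last_hidden.shape)  # torch.Size([2, 1, 16])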

QEfficient/transformers/models/modeling_auto.py

Lines changed: 15 additions & 15 deletions
@@ -20,7 +20,7 @@
 from QEfficient.transformers.quantizers.quantizer_awq import QEffAwqConfig
 from QEfficient.transformers.quantizers.quantizer_gptq import QEffGPTQConfig
 from QEfficient.utils import get_qpc_dir_path, load_hf_tokenizer
-from QEfficient.utils.constants import QEFF_MODELS_DIR, NUM_LOGITS_TO_KEEP
+from QEfficient.utils.constants import QEFF_MODELS_DIR
 from QEfficient.utils.logging_utils import logger
 
 # Dictionary that defines the interface from transformers to be used underneath the QEFF interface
@@ -58,7 +58,7 @@ def __init__(self, model: nn.Module, pretrained_model_name_or_path: str, **kwarg
         self.model_card_name = self.pretrained_model_name_or_path
 
         self.full_batch_size = kwargs.get("full_batch_size", None)
-        self.num_logits_to_keep = kwargs.get("num_logits_to_keep", NUM_LOGITS_TO_KEEP)
+        self.num_speculative_tokens = kwargs.get("num_speculative_tokens", None)
         self.is_dlm = kwargs.get("is_dlm", False)
         self.kwargs = kwargs
         self._tokenizer = None
@@ -105,7 +105,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
 
         full_batch_size = kwargs.pop("full_batch_size", None)
 
-        num_logits_to_keep = kwargs.pop("num_logits_to_keep", NUM_LOGITS_TO_KEEP)
+        num_speculative_tokens = kwargs.pop("num_speculative_tokens", None)
         is_dlm = kwargs.pop("is_dlm", False)
 
         attn_implementation = kwargs.get("attn_implementation", None)
@@ -125,7 +125,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs):
             pretrained_model_name_or_path=pretrained_model_name_or_path,
             model_card_name=model_card_name,
             full_batch_size=full_batch_size,
-            num_logits_to_keep=num_logits_to_keep,
+            num_speculative_tokens=num_speculative_tokens,
             is_dlm=is_dlm,
             **kwargs,
         )
@@ -167,15 +167,15 @@ class QEFFAutoModelForCausalLM(QEFFTransformersBase):
 
     def transform(
         self,
-        num_logits_to_keep: Optional[int] = NUM_LOGITS_TO_KEEP,
+        num_speculative_tokens: Optional[int] = None,
         is_dlm: bool = False,
         **kwargs):
        """
        This method applies all relevant optimization transforms on the model and toggles the ``self.is_transformed`` attribute to True. If the model is already transformed, the method will simply return.
        Please note that this method does not require any input arguments."

        ``Optional`` Args:
-            :num_logits_to_keep (int, optional): Number of speculative tokens, specified only for TLM SpD model.
+            :num_speculative_tokens (int, optional): Number of speculative tokens, specified only for TLM SpD model.
            :is_dlm (bool): True if this is a DLM SpD model.

        Returns:
@@ -202,11 +202,11 @@ def transform(
        if isinstance(self.model.config.quantization_config, QEffGPTQConfig):
            self._pytorch_transforms.insert(0, GPTQToMatmulNbitsTransform)

-        if num_logits_to_keep is not None:
-            if not isinstance(num_logits_to_keep, int) or num_logits_to_keep<2:
-                ValueError("`num_logits_to_keep` arg should be an integer greater than 1.")
+        if num_speculative_tokens is not None:
+            if not isinstance(num_speculative_tokens, int) or num_speculative_tokens<2:
+                ValueError("`num_speculative_tokens` arg should be an integer greater than 1.")
            if is_dlm:
-                raise ValueError("`num_logits_to_keep` arg and `is_dlm` flag are mutually exclusive.")
+                raise ValueError("`num_speculative_tokens` arg and `is_dlm` flag are mutually exclusive.")
            self._pytorch_transforms.append(SpDTransform)

        for transform in self._pytorch_transforms:
@@ -239,7 +239,7 @@ def export(self) -> str:
            model_kv=self,
            tokenizer=self.tokenizer,
            full_batch_size=self.full_batch_size,
-            num_logits_to_keep=self.num_logits_to_keep,
+            num_speculative_tokens=self.num_speculative_tokens,
        )
        self.onnx_path = onnx_model_path

@@ -311,8 +311,8 @@ def compile(
            mxfp6=mxfp6,
            mxint8=mxint8,
            full_batch_size=self.full_batch_size,
-            num_logits_to_keep=self.num_logits_to_keep,
-            is_dlm=getattr(self.model, "is_dlm", False),
+            num_speculative_tokens=self.num_speculative_tokens,
+            is_dlm=self.is_dlm,
        )
        self.qpc_path = qpc_dir_path
        return self.qpc_path
@@ -375,8 +375,8 @@ def export_and_compile(
            mxfp6=mxfp6,
            mxint8=mxint8,
            full_batch_size=full_batch_size,
-            num_logits_to_keep=self.num_logits_to_keep,
-            is_dlm=getattr(self.model, "is_dlm", False),
+            num_speculative_tokens=self.num_speculative_tokens,
+            is_dlm=self.is_dlm,
        )
        return self.qpc_path
QEfficient/transformers/models/spd/modeling_tlm.py

Lines changed: 1 addition & 2 deletions
@@ -12,7 +12,6 @@
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from QEfficient.utils.constants import NUM_LOGITS_TO_KEEP
 from QEfficient.transformers.modeling_spd_utils import filter_hidden_states
 
 def tlm_forward(
@@ -29,7 +28,7 @@ def tlm_forward(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
-    #num_logits_to_keep: Optional[torch.LongTensor] = None,
+    #num_logits_to_keep: Optional[torch.LongTensor] = None, # explicit passing is not currently supported
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
     Args:

QEfficient/utils/constants.py

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@
 
 import os
 
-NUM_LOGITS_TO_KEEP = None
 UTILS_DIR = os.path.dirname(os.path.abspath(__file__))
 QEFF_DIR = os.path.dirname(UTILS_DIR)
 ROOT_DIR = os.path.dirname(QEFF_DIR)
