Commit fa058b7

added back mxint8 export and compilation
Signed-off-by: eplatero <quic_eplatero@quicinc.com>
1 parent abd04e4 commit fa058b7

2 files changed: 20 additions & 18 deletions

QEfficient/utils/generate_inputs.py

Lines changed: 4 additions & 2 deletions
@@ -116,12 +116,14 @@ def update_pytorch_inputs(self, inputs, pt_outputs):
         if self.full_batch_size:
             # Create CB inputs (make 1 batch index have proper inputs for decode pass)
             batch_index = torch.arange(1).view(-1, 1)
-            batch_idx_input_ids = pt_outputs.logits.detach().argmax(2)
+            batch_idx_input_ids = pt_outputs.logits.detach().argmax(2)  # shape: [batch_size, num_logits_to_keep]
             input_ids = torch.full((self.full_batch_size, decode_len), self.tokenizer.pad_token_id)
             input_ids[batch_index.view(-1)] = batch_idx_input_ids
+
             position_ids = torch.full((self.full_batch_size, decode_len), 0)
             batch_idx_position_ids = torch.arange(decode_len).view(1,-1) + (inputs["position_ids"].max(1, keepdim=True).values + 1)
             position_ids[batch_index.view(-1)] = batch_idx_position_ids
+
             updated_inputs["input_ids"] = input_ids
             updated_inputs["position_ids"] = position_ids
             updated_inputs["batch_index"] = torch.arange(self.full_batch_size).view(-1, 1)
@@ -132,7 +134,7 @@ def update_pytorch_inputs(self, inputs, pt_outputs):
             batch_size = input_ids.size(0)
             position_ids = torch.arange(self.num_logits_to_keep).view(1, self.num_logits_to_keep).repeat(batch_size, 1)
         else:
-            input_ids = pt_outputs["logits"].argmax(-1).reshape(-1, 1)
+            input_ids = pt_outputs["logits"].argmax(-1).reshape(-1, 1)  # shape: [batch_size, 1]
             position_ids = inputs["position_ids"].max(1, keepdim=True).values + 1
         updated_inputs["input_ids"] = input_ids
         updated_inputs["position_ids"] = position_ids
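For orientation, here is a minimal standalone sketch of the shape bookkeeping the new comments describe in the continuous-batching (CB) branch. The sizes (full_batch_size=8, decode_len=2, a toy vocab, pad_token_id=0) are illustrative assumptions, not values taken from this commit:

```python
import torch

# Illustrative values only; the real ones come from the model config and tokenizer.
full_batch_size = 8   # number of CB slots
decode_len = 2        # stands in for num_logits_to_keep
vocab_size = 32000
pad_token_id = 0

# Stand-in for pt_outputs.logits from prefill: [batch_size, num_logits_to_keep, vocab_size]
logits = torch.randn(1, decode_len, vocab_size)
prefill_position_ids = torch.arange(32).view(1, -1)  # stand-in for inputs["position_ids"]

batch_index = torch.arange(1).view(-1, 1)
batch_idx_input_ids = logits.detach().argmax(2)  # shape: [batch_size, num_logits_to_keep]

# Pad every CB slot, then fill the one active slot with the sampled tokens.
input_ids = torch.full((full_batch_size, decode_len), pad_token_id)
input_ids[batch_index.view(-1)] = batch_idx_input_ids

position_ids = torch.full((full_batch_size, decode_len), 0)
batch_idx_position_ids = torch.arange(decode_len).view(1, -1) + (
    prefill_position_ids.max(1, keepdim=True).values + 1
)
position_ids[batch_index.view(-1)] = batch_idx_position_ids

print(input_ids.shape, position_ids.shape)  # torch.Size([8, 2]) torch.Size([8, 2])
```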

tests/spd/test_tlm_dlm_export_and_compile.py

Lines changed: 16 additions & 16 deletions
@@ -16,23 +16,23 @@

 configs = [
     pytest.param(
-        [0], # device_group
-        2, # num_speculative_tokens
-        32, # prompt_len
-        128, # ctx_len
-        1, # prefill_bsz
-        8, # full_batch_size
-        "JackFram/llama-68m", # model_name
+        [0],  # device_group
+        2,  # num_speculative_tokens
+        32,  # prompt_len
+        128,  # ctx_len
+        1,  # prefill_bsz
+        8,  # full_batch_size
+        "JackFram/llama-68m",  # model_name
         id="CB llama",
     ),
     pytest.param(
-        [0], # device_group
-        2, # num_speculative_tokens
-        32, # prompt_len
-        128, # ctx_len
-        1, # prefill_bsz
-        None, # full_batch_size
-        "JackFram/llama-68m", # model_name
+        [0],  # device_group
+        2,  # num_speculative_tokens
+        32,  # prompt_len
+        128,  # ctx_len
+        1,  # prefill_bsz
+        None,  # full_batch_size
+        "JackFram/llama-68m",  # model_name
         id="non-CB llama",
     ),
 ]
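As context, a hedged sketch of how a parametrized test could consume tuples like the ones above; the field order follows the inline comments, while the decorator and test signature actually used in test_tlm_dlm_export_and_compile.py are not shown in this diff and may differ:

```python
import pytest

# Mirrors the configs list above (CB case with full_batch_size=8, non-CB case with None).
configs = [
    pytest.param([0], 2, 32, 128, 1, 8, "JackFram/llama-68m", id="CB llama"),
    pytest.param([0], 2, 32, 128, 1, None, "JackFram/llama-68m", id="non-CB llama"),
]

@pytest.mark.parametrize(
    "device_group,num_speculative_tokens,prompt_len,ctx_len,prefill_bsz,full_batch_size,model_name",
    configs,
)
def test_param_wiring(device_group, num_speculative_tokens, prompt_len, ctx_len,
                      prefill_bsz, full_batch_size, model_name):
    # full_batch_size=8 exercises the continuous-batching (CB) path; None the regular path.
    assert full_batch_size is None or full_batch_size >= prefill_bsz
```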
@@ -63,7 +63,7 @@ def test_llama_tlm_logit_dims(
         prompt_len=prompt_len,
         ctx_len=ctx_len,
         mxfp6=True,
-        # mxint8=True,
+        mxint8=True,
         full_batch_size=full_batch_size,
     )

@@ -126,7 +126,7 @@ def test_llama_dlm_logit_dims(
         prompt_len=prompt_len,
         ctx_len=ctx_len,
         mxfp6=True,
-        # mxint8=True,
+        mxint8=True,
         full_batch_size=full_batch_size,
     )

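These two hunks re-enable mxint8 alongside mxfp6 in the TLM and DLM export-and-compile tests. The surrounding call is truncated in the diff, so the snippet below is only a hedged sketch: the helper name is a made-up stand-in, and only the keyword names visible above (prompt_len, ctx_len, mxfp6, mxint8, full_batch_size) plus the values from the configs list come from this commit.

```python
def export_and_compile_stub(**kwargs):
    """Stand-in for the test's real export/compile helper (name assumed, not from the diff)."""
    print("would export ONNX and compile a QPC with:", kwargs)

# Values below are taken from the "CB llama" config above; use full_batch_size=None
# for the non-CB case.
export_and_compile_stub(
    model_name="JackFram/llama-68m",
    device_group=[0],
    num_speculative_tokens=2,
    prompt_len=32,
    ctx_len=128,
    prefill_bsz=1,
    full_batch_size=8,
    mxfp6=True,   # MXFP6 compression, already enabled before this commit
    mxint8=True,  # re-enabled by this commit
)
```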