From 0b80dacf623150a0359272994246e54e4a1cd772 Mon Sep 17 00:00:00 2001 From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com> Date: Wed, 20 Nov 2024 22:24:09 +0530 Subject: [PATCH 01/17] Docker-driven tests with latest SDKs (#180) * Added Docker support to the Jenkins tests Signed-off-by: amitraj * Addressed comments Signed-off-by: amitraj * updated qaic tests time upper limit to 60 minutes Signed-off-by: Onkar Chougule --------- Signed-off-by: amitraj * Added support for Embedding moodels --- QEfficient/__init__.py | 3 +- .../generation/text_generation_inference.py | 24 +++++- .../transformers/models/modeling_auto.py | 84 ++++++++++++++++--- .../transformers/models/pytorch_transforms.py | 3 + QEfficient/transformers/quantizers/auto.py | 1 - .../models/test_causal_lm_models.py | 59 ++++++++++++- 6 files changed, 159 insertions(+), 15 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 0f7f40483..987399316 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -8,7 +8,7 @@ from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_embed, cloud_ai_100_exec_kv from QEfficient.peft import QEffAutoPeftModelForCausalLM from QEfficient.transformers.transform import transform @@ -21,6 +21,7 @@ "export", "compile", "cloud_ai_100_exec_kv", + "cloud_ai_100_exec_embed", "QEffAutoModel", "QEFFAutoModelForCausalLM", "QEffAutoPeftModelForCausalLM", diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 4ddd57ada..3f8692dbe 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -347,7 +347,29 @@ def cloud_ai_100_exec_kv( return exec_info -class QEffTextGenerationBase: +def cloud_ai_100_exec_embed( + tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], + prompt: List[str], + qpc_path: str, + device_id: List[int] = [0], +): + session = QAICInferenceSession(qpc_path, device_ids=device_id) + seq_len = session.bindings[0].dims[1] + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + + prefill_inputs = dict( + input_ids=inputs["input_ids"].numpy(), + attention_mask=inputs["attention_mask"].numpy(), + ) + prefill_logits = { + "output": np.random.randn(1, seq_len, session.bindings[2].dims[2]).astype(np.float32), + } + session.set_buffers(prefill_logits) + prefill_outputs = session.run(prefill_inputs) + return prefill_outputs + + +class TextGeneration: def __init__( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index d0bb4285f..719cdee3c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -9,7 +9,7 @@ import logging import warnings from pathlib import Path -from typing import Any, List, Optional, Union +from typing import List, Optional, Union import torch import torch.nn as nn @@ -35,9 +35,6 @@ class QEFFTransformersBase(QEFFBaseModel): _hf_auto_class: type def __init__(self, model: nn.Module) -> None: - model_class_name = model.__class__.__name__ - if not 
(model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")): - raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}") if hasattr(model.config, "quantization_config") and not isinstance( model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values()) @@ -386,11 +383,78 @@ def generate( class QEffAutoModel(QEFFTransformersBase): _hf_auto_class = AutoModel - _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform] - _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + _pytorch_transforms = [CustomOpsTransform] + _onnx_transforms = [FP16ClipTransform] + + def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs): + super().__init__(model) + self.model.config.use_cache = True + self.num_layers = model.config.num_hidden_layers + + def export(self, export_dir: Optional[str] = None) -> str: + seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + + example_inputs = { + "input_ids": torch.zeros((1, seq_len), dtype=torch.int64), + "attention_mask": torch.ones((1, seq_len), dtype=torch.int64), + } + + dynamic_axes = {"input_ids": {1: "seq_len"}, "attention_mask": {1: "seq_len"}} + + output_names = ["output"] + + return self._export( + example_inputs, + output_names, + dynamic_axes, + export_dir=export_dir, + ) + + def compile( + self, + onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + *, + seq_len: int = 32, + num_cores: int = 14, # FIXME: Make this mandatory arg + **compiler_options, + ) -> str: + specializations = [ + {"seq_len": seq_len}, + ] + + return self._compile( + onnx_path, + compile_dir, + compile_only=True, + specializations=specializations, + convert_to_fp16=True, + aic_num_cores=num_cores, + **compiler_options, + ) + + def generate( + self, + tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], + prompt: List[str], + device_id: List[int] = [0], + runtime_ai100: bool = True, + seq_len: int = constants.Constants.CTX_LEN, + ): + if runtime_ai100: + if not isinstance(self.qpc_path, Path): + raise TypeError("Please run compile API first!") + + return QEfficient.cloud_ai_100_exec_embed( + tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id + ) + else: + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + return self.model(**inputs) + - def export(self): - raise NotImplementedError("Reached too far!!") + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - def compile(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") + return self diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 6b8d00689..6a15befcf 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -305,6 +305,9 @@ class KVCacheTransform(ModuleMappingTransform): @classmethod def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: + import ipdb + + ipdb.set_trace() model, transformed = super().apply(model) # FIXME: see if we can merge into _module_mapping dict transformers.cache_utils.DynamicCache.update = QEffDynamicCache.update diff --git a/QEfficient/transformers/quantizers/auto.py b/QEfficient/transformers/quantizers/auto.py index f4cec3b54..b5b4be099 100644 --- 
a/QEfficient/transformers/quantizers/auto.py +++ b/QEfficient/transformers/quantizers/auto.py @@ -38,7 +38,6 @@ def wrapper(*args, **kwargs): # Put back quantization config and quantizer for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): - AUTO_QUANTIZATION_CONFIG_MAPPING[k] = transformers_replaced_quantization_config_mapping[k] AUTO_QUANTIZER_MAPPING[k] = transformers_replaced_quantizer_mapping[k] return out diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 6e91711e0..de0aec0ca 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -8,11 +8,12 @@ from typing import Optional import numpy as np +import onnxruntime as ort import pytest -from transformers import AutoModelForCausalLM +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import hf_download from QEfficient.utils._utils import load_hf_tokenizer @@ -179,6 +180,55 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." +def check_embed_pytorch_vs_ort_vs_ai100( + model_name: str, + seq_len: int = Constants.CTX_LEN, + n_layer: int = 1, +): + model_path = hf_download( + repo_id=model_name, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + + # Try to initialize with add_pooling_layer parameter + try: + model = AutoModel.from_pretrained(model_name, add_pooling_layer=False) + qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path, add_pooling_layer=False) + except TypeError: + # If it fails, initialize without the parameter + model = AutoModel.from_pretrained(model_name) + qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path) + text = "My name is" + tokenizer = AutoTokenizer.from_pretrained(model_name) + inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=seq_len) + + pt_outputs=qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) + + onnx_model = qeff_model.export() + ort_session = ort.InferenceSession(str(onnx_model)) + # Prepare the inputs for ONNX Runtime + onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} + # Run inference + onnx_outputs = ort_session.run(None, onnx_inputs) + + # Compare PyTorch and ONNX outputs + pt_embeddings = pt_outputs[0].detach().numpy() + onnx_embeddings = onnx_outputs[0] + mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) + print("Mad for onnx and pytorch is ", mad) + assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" + + qeff_model.compile( + num_cores=14, + ) + ai100_output = qeff_model.generate(tokenizer=tokenizer, prompt=["My name is"]) + + # Compare ONNX and AI 100 outputs + mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0])) + print("Mad for onnx and AI 100 output is ", mad) + assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}" + + # FIXME: there should be a CB test here @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: 
x) def test_causal_lm_export_with_deprecated_api(model_name): @@ -252,3 +302,8 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): prompt_len = 1 check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) + +@pytest.mark.on_qaic +def test_embed_model_pytorch_vs_onnx_vs_ai100(): + model_name = "BAAI/bge-small-en-v1.5" + check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) From e83b29a851f4171f8e8e72474790eb5960f59ef4 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 10:25:35 +0530 Subject: [PATCH 02/17] Added support for embedding models Signed-off-by: amitraj --- QEfficient/generation/text_generation_inference.py | 2 +- QEfficient/transformers/models/modeling_auto.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 3f8692dbe..eaa62a926 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -369,7 +369,7 @@ def cloud_ai_100_exec_embed( return prefill_outputs -class TextGeneration: +class QEffTextGenerationBase: def __init__( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 719cdee3c..578bef184 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -440,7 +440,7 @@ def generate( device_id: List[int] = [0], runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, - ): + ) -> str: if runtime_ai100: if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") From be592874643d276313aa91ac2907c991dbbd0bb3 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 10:53:16 +0530 Subject: [PATCH 03/17] Lint & Format Signed-off-by: amitraj --- QEfficient/transformers/models/modeling_auto.py | 4 +--- QEfficient/transformers/models/pytorch_transforms.py | 3 --- tests/transformers/models/test_causal_lm_models.py | 7 +++---- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 578bef184..a803906df 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -35,7 +35,6 @@ class QEFFTransformersBase(QEFFBaseModel): _hf_auto_class: type def __init__(self, model: nn.Module) -> None: - if hasattr(model.config, "quantization_config") and not isinstance( model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values()) ): @@ -443,7 +442,7 @@ def generate( ) -> str: if runtime_ai100: if not isinstance(self.qpc_path, Path): - raise TypeError("Please run compile API first!") + raise TypeError("Please run compile API first!") return QEfficient.cloud_ai_100_exec_embed( tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id @@ -451,7 +450,6 @@ def generate( else: inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) return self.model(**inputs) - @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 6a15befcf..6b8d00689 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ 
b/QEfficient/transformers/models/pytorch_transforms.py @@ -305,9 +305,6 @@ class KVCacheTransform(ModuleMappingTransform): @classmethod def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: - import ipdb - - ipdb.set_trace() model, transformed = super().apply(model) # FIXME: see if we can merge into _module_mapping dict transformers.cache_utils.DynamicCache.update = QEffDynamicCache.update diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index de0aec0ca..b765ae609 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -10,7 +10,7 @@ import numpy as np import onnxruntime as ort import pytest -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM @@ -192,17 +192,15 @@ def check_embed_pytorch_vs_ort_vs_ai100( # Try to initialize with add_pooling_layer parameter try: - model = AutoModel.from_pretrained(model_name, add_pooling_layer=False) qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path, add_pooling_layer=False) except TypeError: # If it fails, initialize without the parameter - model = AutoModel.from_pretrained(model_name) qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path) text = "My name is" tokenizer = AutoTokenizer.from_pretrained(model_name) inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=seq_len) - pt_outputs=qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) + pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) onnx_model = qeff_model.export() ort_session = ort.InferenceSession(str(onnx_model)) @@ -303,6 +301,7 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) + @pytest.mark.on_qaic def test_embed_model_pytorch_vs_onnx_vs_ai100(): model_name = "BAAI/bge-small-en-v1.5" From 12841558ecbea676928da9b0a72fd85d7be0ebb8 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 11:34:03 +0530 Subject: [PATCH 04/17] Added batch_size Signed-off-by: amitraj --- .../transformers/models/modeling_auto.py | 31 +++++++++++++++---- QEfficient/transformers/quantizers/auto.py | 1 + 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index a803906df..02b6e55e2 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -272,7 +272,7 @@ def compile( :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. :full_batch_size (int, optional): Continuous batching batch size. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``. + :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. 
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. @@ -381,6 +381,23 @@ def generate( class QEffAutoModel(QEFFTransformersBase): + """ + The QEffAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. + Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + + ``Mandatory`` Args: + :model (nn.Module): PyTorch model + + .. code-block:: python + + from QEfficient import QEffAutoModel + + model = QEffAutoModel.from_pretrained(model_name, num_hidden_layers=2) + model.compile(prefill_seq_len=32, ctx_len=1024) + + model.generate(prompts=["Hello, world!"]) + """ + _hf_auto_class = AutoModel _pytorch_transforms = [CustomOpsTransform] _onnx_transforms = [FP16ClipTransform] @@ -391,14 +408,15 @@ def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs self.num_layers = model.config.num_hidden_layers def export(self, export_dir: Optional[str] = None) -> str: + bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN example_inputs = { - "input_ids": torch.zeros((1, seq_len), dtype=torch.int64), - "attention_mask": torch.ones((1, seq_len), dtype=torch.int64), + "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), + "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), } - dynamic_axes = {"input_ids": {1: "seq_len"}, "attention_mask": {1: "seq_len"}} + dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}} output_names = ["output"] @@ -415,11 +433,12 @@ def compile( compile_dir: Optional[str] = None, *, seq_len: int = 32, - num_cores: int = 14, # FIXME: Make this mandatory arg + batch_size: int = 1, + num_cores: int = 16, # FIXME: Make this mandatory arg **compiler_options, ) -> str: specializations = [ - {"seq_len": seq_len}, + {"batch_size": batch_size, "seq_len": seq_len}, ] return self._compile( diff --git a/QEfficient/transformers/quantizers/auto.py b/QEfficient/transformers/quantizers/auto.py index b5b4be099..f4cec3b54 100644 --- a/QEfficient/transformers/quantizers/auto.py +++ b/QEfficient/transformers/quantizers/auto.py @@ -38,6 +38,7 @@ def wrapper(*args, **kwargs): # Put back quantization config and quantizer for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): + AUTO_QUANTIZATION_CONFIG_MAPPING[k] = transformers_replaced_quantization_config_mapping[k] AUTO_QUANTIZER_MAPPING[k] = transformers_replaced_quantizer_mapping[k] return out From 3f95df74144a981d641f9f66427df0a82b636604 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 12:11:27 +0530 Subject: [PATCH 05/17] Docstring added Signed-off-by: amitraj --- .../transformers/models/modeling_auto.py | 79 +++++++++++++++++-- 1 file changed, 71 insertions(+), 8 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 02b6e55e2..9cdfd47fc 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -186,7 +186,6 @@ def model_hash(self) -> str: def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. - We currently don't support exporting non-transformed models. 
Please refer to the ``convert_to_cloud_bertstyle`` function in the **Low-Level API** for a legacy function that supports this." ``Optional`` Args: :export_dir (str, optional): The directory path to store ONNX-graph. @@ -393,7 +392,7 @@ class QEffAutoModel(QEFFTransformersBase): from QEfficient import QEffAutoModel model = QEffAutoModel.from_pretrained(model_name, num_hidden_layers=2) - model.compile(prefill_seq_len=32, ctx_len=1024) + model.compile() model.generate(prompts=["Hello, world!"]) """ @@ -402,12 +401,49 @@ class QEffAutoModel(QEFFTransformersBase): _pytorch_transforms = [CustomOpsTransform] _onnx_transforms = [FP16ClipTransform] - def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs): + def __init__(self, model: nn.Module, **kwargs): super().__init__(model) self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + """ + This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel. + Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. + + Args: + :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. + :args, kwargs: Additional arguments to pass to transformers.AutoModel. + + .. code-block:: python + + from QEfficient import QEFFAutoModel + + # Initialize the model using from_pretrained similar to transformers.AutoModel. + model = QEFFAutoModel.from_pretrained("BAAI/bge-small-en-v1.5") + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + + # You can now execute the model + model.generate(prompts=["Hi there!!"]) + """ + + self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + + return self def export(self, export_dir: Optional[str] = None) -> str: + """ + Exports the model to ``ONNX`` format using ``torch.onnx.export``. + + ``Optional`` Args: + does not any arguments. + + Returns: + :str: Path of the generated ``ONNX`` graph. + """ bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN @@ -437,6 +473,21 @@ def compile( num_cores: int = 16, # FIXME: Make this mandatory arg **compiler_options, ) -> str: + """ + This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. + If the model has not been exported yet, this method will handle the export process. + You can pass any other arguments that the `qaic-exec` takes as extra kwargs. + + ``Optional`` Args: + :onnx_path (str, optional): Path to pre-exported onnx model. + :compile_dir (str, optional): Path for saving the qpc generated. + :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :num_cores (int): Number of cores used to compile the model. + Returns: + :str: Path of the compiled ``qpc`` package. + """ + specializations = [ {"batch_size": batch_size, "seq_len": seq_len}, ] @@ -459,6 +510,22 @@ def generate( runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, ) -> str: + """ + This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. 
+ This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :prompts (List[str]): List of prompts to run the execution. + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + ``optional`` Args: + :runtime_ai100 (bool), optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + + Returns: + :str: Output from the ``AI_100`` or ``PyTorch`` runtime. + """ + + # AI_100 runtime if runtime_ai100: if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") @@ -466,12 +533,8 @@ def generate( return QEfficient.cloud_ai_100_exec_embed( tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id ) + # PyTorch runtime else: inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) return self.model(**inputs) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): - self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - - return self From 74ffc16f36cf2d5ac0ed0a8d23906302cda63e80 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 13:39:40 +0530 Subject: [PATCH 06/17] Fix-1 Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index eaa62a926..9b65b80ee 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -174,7 +174,8 @@ def get_compilation_dims(qpc_path: str) -> Tuple[int, int, Optional[int]]: raise FileNotFoundError(f"expected specializations.json file at path, {qpc_base_path}") compilation_batch_size = int(data["specializations"][0]["batch_size"]) - compilation_ctx_len = int(data["specializations"][0]["ctx_len"]) + if compilation_ctx_len := data["specializations"][0].get("ctx_len", None): + compilation_ctx_len = int(data["specializations"][0]["ctx_len"]) if compilation_fbs := data["specializations"][0].get("full_batch_size", None): compilation_fbs = int(compilation_fbs) return compilation_batch_size, compilation_ctx_len, compilation_fbs @@ -349,25 +350,25 @@ def cloud_ai_100_exec_kv( def cloud_ai_100_exec_embed( tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], - prompt: List[str], qpc_path: str, - device_id: List[int] = [0], + prompt: List[str], + device_id: List[int] = [0], ): session = QAICInferenceSession(qpc_path, device_ids=device_id) + batch_size = session.bindings[0].dims[0] seq_len = session.bindings[0].dims[1] inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - prefill_inputs = dict( + inputs = dict( input_ids=inputs["input_ids"].numpy(), attention_mask=inputs["attention_mask"].numpy(), ) - prefill_logits = { - "output": np.random.randn(1, seq_len, session.bindings[2].dims[2]).astype(np.float32), + output = { + "output": np.random.randn(batch_size, seq_len, session.bindings[2].dims[2]).astype(np.float32), } - session.set_buffers(prefill_logits) - prefill_outputs = session.run(prefill_inputs) - return prefill_outputs - + session.set_buffers(output) + outputs = 
session.run(inputs) + return outputs class QEffTextGenerationBase: def __init__( From 2fb41ad108da8cad886617b4a8f56117d942b651 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 14:38:40 +0530 Subject: [PATCH 07/17] Comments Addressed-1 Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 25 +++++- .../transformers/models/modeling_auto.py | 88 ++++++++++--------- QEfficient/utils/constants.py | 2 +- .../models/test_causal_lm_models.py | 45 ++++++++-- 4 files changed, 105 insertions(+), 55 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 9b65b80ee..5fa55ed9a 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -174,8 +174,7 @@ def get_compilation_dims(qpc_path: str) -> Tuple[int, int, Optional[int]]: raise FileNotFoundError(f"expected specializations.json file at path, {qpc_base_path}") compilation_batch_size = int(data["specializations"][0]["batch_size"]) - if compilation_ctx_len := data["specializations"][0].get("ctx_len", None): - compilation_ctx_len = int(data["specializations"][0]["ctx_len"]) + compilation_ctx_len = int(data["specializations"][0]["ctx_len"]) if compilation_fbs := data["specializations"][0].get("full_batch_size", None): compilation_fbs = int(compilation_fbs) return compilation_batch_size, compilation_ctx_len, compilation_fbs @@ -352,8 +351,24 @@ def cloud_ai_100_exec_embed( tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], qpc_path: str, prompt: List[str], - device_id: List[int] = [0], -): + device_id: List[int] = [0], +) -> dict: + """ + This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :qpc_path (str): Path to the saved generated binary file after compilation. + :prompt (str): Sample prompt for the model text generation. + ``Optional`` Args: + :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. + + Returns: + :dict: Output from the ``AI_100`` runtime. + """ + session = QAICInferenceSession(qpc_path, device_ids=device_id) batch_size = session.bindings[0].dims[0] seq_len = session.bindings[0].dims[1] @@ -368,8 +383,10 @@ def cloud_ai_100_exec_embed( } session.set_buffers(output) outputs = session.run(inputs) + session.deactivate() return outputs + class QEffTextGenerationBase: def __init__( self, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 9cdfd47fc..05b62dcb6 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -66,22 +66,6 @@ def model_name(self) -> str: mname = mname[4:] return mname - @property - def model_hash(self) -> str: - # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path. - # Using same card name will result in same hash. But, using a relative path for one run and - # absolute path for another run will result in different hash. 
- # The added complexity to resolve different paths to same location is not worth pursuing. - # Instead, advise the user to always provide same relative paths or absolute paths for local models. - - # Compute the hash with: model_config, transforms - mhash = hashlib.sha256() - mhash.update(to_hashable(self.model.config.to_diff_dict())) - mhash.update(to_hashable(self._transform_names())) - mhash.update(to_hashable({"is_tlm": self.is_tlm})) - mhash = mhash.hexdigest()[:16] - return mhash - class QEFFAutoModelForCausalLM(QEFFTransformersBase): """ @@ -349,8 +333,9 @@ def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = None, - runtime: str = "AI_100", + device_id: List[int] = [0], + runtime_ai100: bool = True, + seq_len: int = constants.Constants.CTX_LEN, **kwargs, ): """ @@ -362,21 +347,24 @@ def generate( :prompts (List[str]): List of prompts to run the execution. :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model ``optional`` Args: - :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100". + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + """ - if runtime != "AI_100": - raise ValueError("Only AI_100 runtime is supported right now via generate API") - if not isinstance(self.qpc_path, Path): - raise TypeError("Please run compile API first!") - generation_len = kwargs.pop("generation_len", None) - return QEfficient.cloud_ai_100_exec_kv( - tokenizer, - self.qpc_path, - prompt=prompts, - device_id=device_id, - generation_len=generation_len, - is_tlm=self.is_tlm, - ) + if runtime_ai100: + if not isinstance(self.qpc_path, Path): + raise TypeError("Please run compile API first!") + generation_len = kwargs.pop("generation_len", None) + return QEfficient.cloud_ai_100_exec_kv( + tokenizer, + self.qpc_path, + prompt=prompts, + device_id=device_id, + generation_len=generation_len, + is_tlm=self.is_tlm, + ) + else: + inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len) + return self.model(**inputs) class QEffAutoModel(QEFFTransformersBase): @@ -405,7 +393,7 @@ def __init__(self, model: nn.Module, **kwargs): super().__init__(model) self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers - + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """ @@ -429,11 +417,26 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # You can now execute the model model.generate(prompts=["Hi there!!"]) """ - + self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) return self + @property + def model_hash(self) -> str: + # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path. + # Using same card name will result in same hash. But, using a relative path for one run and + # absolute path for another run will result in different hash. + # The added complexity to resolve different paths to same location is not worth pursuing. + # Instead, advise the user to always provide same relative paths or absolute paths for local models. 
+ + # Compute the hash with: model_config, transforms + mhash = hashlib.sha256() + mhash.update(to_hashable(self.model.config.to_diff_dict())) + mhash.update(to_hashable(self._transform_names())) + mhash = mhash.hexdigest()[:16] + return mhash + def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. @@ -470,7 +473,9 @@ def compile( *, seq_len: int = 32, batch_size: int = 1, + num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg + mxfp6_matmul: bool = False, **compiler_options, ) -> str: """ @@ -498,6 +503,8 @@ def compile( compile_only=True, specializations=specializations, convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, **compiler_options, ) @@ -505,11 +512,11 @@ def compile( def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], - prompt: List[str], + prompts: List[str], device_id: List[int] = [0], runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, - ) -> str: + ) -> dict: """ This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. @@ -519,10 +526,10 @@ def generate( :prompts (List[str]): List of prompts to run the execution. :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model ``optional`` Args: - :runtime_ai100 (bool), optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. Returns: - :str: Output from the ``AI_100`` or ``PyTorch`` runtime. + :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. 
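A minimal usage sketch of the two runtime paths described above, assuming an example embedding model card and that ``compile`` has already produced a QPC; the prompts and core count are illustrative placeholders, not values mandated by this patch.

.. code-block:: python

    from transformers import AutoTokenizer

    from QEfficient import QEffAutoModel

    # "BAAI/bge-small-en-v1.5" is used here only as an example embedding model card.
    card = "BAAI/bge-small-en-v1.5"
    tokenizer = AutoTokenizer.from_pretrained(card)
    model = QEffAutoModel.from_pretrained(card)

    # Compile once to produce the QPC consumed by the AI_100 path.
    model.compile(num_cores=14)

    # AI_100 runtime (default): executes the compiled QPC on Cloud AI 100 hardware.
    ai100_outputs = model.generate(tokenizer=tokenizer, prompts=["My name is"], runtime_ai100=True)

    # PyTorch runtime: tokenizes the prompts and runs a plain forward pass on the torch module.
    pt_outputs = model.generate(tokenizer=tokenizer, prompts=["My name is"], runtime_ai100=False)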
""" # AI_100 runtime @@ -531,10 +538,9 @@ def generate( raise TypeError("Please run compile API first!") return QEfficient.cloud_ai_100_exec_embed( - tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id + tokenizer=tokenizer, prompt=prompts, qpc_path=self.qpc_path, device_id=device_id ) # PyTorch runtime else: - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len) return self.model(**inputs) - diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 4a3ba3ff3..5e3a29072 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -47,7 +47,7 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32 ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep -ONNX_EXPORT_OPSET = 13 +ONNX_EXPORT_OPSET = 14 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"] diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index b765ae609..be17732a7 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -15,7 +15,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers -from QEfficient.utils import hf_download +from QEfficient.utils import hf_download, padding_check_and_fix from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants from QEfficient.utils.device_utils import get_available_device_id @@ -192,13 +192,26 @@ def check_embed_pytorch_vs_ort_vs_ai100( # Try to initialize with add_pooling_layer parameter try: - qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path, add_pooling_layer=False) + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + add_pooling_layer=False, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) except TypeError: # If it fails, initialize without the parameter - qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path) - text = "My name is" + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) + + prompt = "My name is" tokenizer = AutoTokenizer.from_pretrained(model_name) - inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=seq_len) + padding_check_and_fix(tokenizer) + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) @@ -214,7 +227,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( onnx_embeddings = onnx_outputs[0] mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) print("Mad for onnx and pytorch is ", mad) - assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" + assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" qeff_model.compile( num_cores=14, @@ -224,7 +237,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( # Compare ONNX and AI 100 outputs mad = np.mean(np.abs(ai100_output["output"] - 
onnx_outputs[0])) print("Mad for onnx and AI 100 output is ", mad) - assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}" + assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" # FIXME: there should be a CB test here @@ -302,7 +315,21 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) +embed_test_models = [ + # model_name, architecture + "nomic-ai/nomic-embed-text-v1.5", # NomicBertModel + "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM + "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification + "BAAI/bge-small-en-v1.5", # BertModel + # "intfloat/e5-mistral-7b-instruct", # MistralModel + # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM +] + + @pytest.mark.on_qaic -def test_embed_model_pytorch_vs_onnx_vs_ai100(): - model_name = "BAAI/bge-small-en-v1.5" +@pytest.mark.parametrize("model_name", embed_test_models) +def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): + """ + Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. + """ check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) From 262f45ebb4c21e793aad326b03017bd77b63b23e Mon Sep 17 00:00:00 2001 From: amitraj Date: Sun, 15 Dec 2024 11:36:11 +0530 Subject: [PATCH 08/17] Comments addressed-2 Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 102 +++++++++++++++--- .../transformers/models/modeling_auto.py | 21 ++-- .../models/test_causal_lm_models.py | 82 +------------- .../models/test_embedding_models.py | 95 ++++++++++++++++ 4 files changed, 196 insertions(+), 104 deletions(-) create mode 100644 tests/transformers/models/test_embedding_models.py diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 5fa55ed9a..40328f725 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -350,8 +350,9 @@ def cloud_ai_100_exec_kv( def cloud_ai_100_exec_embed( tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], qpc_path: str, - prompt: List[str], + prompts: List[str], device_id: List[int] = [0], + enable_debug_logs: bool = False, ) -> dict: """ This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. @@ -368,23 +369,14 @@ def cloud_ai_100_exec_embed( Returns: :dict: Output from the ``AI_100`` runtime. 
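A minimal sketch of calling this helper directly, assuming a QPC has already been generated via ``QEffAutoModel.compile``; the tokenizer card and QPC path below are placeholders.

.. code-block:: python

    from transformers import AutoTokenizer

    from QEfficient import cloud_ai_100_exec_embed

    # Example card taken from the embedding tests in this series.
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")

    # "qpc_path" must point to an existing compiled QPC package; the value here is a placeholder.
    outputs = cloud_ai_100_exec_embed(
        tokenizer=tokenizer,
        qpc_path="qeff_models/bge-small-en-v1.5/qpc",
        prompts=["My name is"],
    )

    # One entry is returned per prompt; each holds the raw "output" buffer from the session run.
    print(outputs[0]["output"].shape)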
""" - - session = QAICInferenceSession(qpc_path, device_ids=device_id) - batch_size = session.bindings[0].dims[0] - seq_len = session.bindings[0].dims[1] - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - - inputs = dict( - input_ids=inputs["input_ids"].numpy(), - attention_mask=inputs["attention_mask"].numpy(), + generate_feature=FeatureGeneration( + tokenizer=tokenizer, + qpc_path=qpc_path, + device_id=device_id, + enable_debug_logs=enable_debug_logs, ) - output = { - "output": np.random.randn(batch_size, seq_len, session.bindings[2].dims[2]).astype(np.float32), - } - session.set_buffers(output) - outputs = session.run(inputs) - session.deactivate() - return outputs + + return generate_feature.generate(prompts=prompts) class QEffTextGenerationBase: @@ -406,6 +398,7 @@ def __init__( # Load QPC self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) + # Fetch the variables from the QPC self._vocab_size = self._fetch_vocab_size() # Fetch Vocab size self.batch_size, self._prefill_seq_len = self._fetch_batch_size_prefill_seq_len() @@ -1110,3 +1103,78 @@ def generate( perf_metrics=perf_metrics, ) return latency_stats + +class QEffFeatureGenerationBase: + + def __init__( + self, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + ctx_len: Optional[int] = None, + device_id: Optional[List[int]] = None, + enable_debug_logs: bool = False, + ) -> None: + self.ctx_len = ctx_len + + # Load QPC + self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) + + self._batch_size = self._session.bindings[0].dims[0] + self._seq_len = self._session.bindings[0].dims[1] + + self.tokenizer = tokenizer + self._set_tokenizer_params() # set tokenizer params + + def _set_tokenizer_params(self): + """ + Sets the tokenizer parameters for the model. 
+ """ + if self.tokenizer.padding_side != "right": + logger.warning("Please use padding_side='right' while initializing the tokenizer") + self.tokenizer.padding_side = "right" + if self.tokenizer.pad_token_id is None: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + + +class FeatureGeneration: + def __init__( + self, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + seq_len: Optional[int] = None, + device_id: Optional[List[int]] = None, + enable_debug_logs: bool = False, + ) -> None: + + self._qaic_model = QEffFeatureGenerationBase( + tokenizer, qpc_path, seq_len, device_id, enable_debug_logs + ) + self._batch_size = self._qaic_model._batch_size + self._tokenizer = self._qaic_model.tokenizer + self._seq_len = self._qaic_model._seq_len + self._session = self._qaic_model._session + def generate( + self, + prompts: List[str] + ): + outputs = [] + + for prompt in prompts: + inputs = self._tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self._seq_len) + + inputs = dict( + input_ids=inputs["input_ids"].numpy(), + attention_mask=inputs["attention_mask"].numpy(), + ) + output = { + "output": np.random.randn(self._batch_size, self._seq_len, self._session.bindings[2].dims[2]).astype( + np.float32 + ), + } + self._session.set_buffers(output) + output = self._session.run(inputs) + outputs.append(output) + return outputs + + + diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 05b62dcb6..027ca54eb 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -335,7 +335,6 @@ def generate( prompts: List[str], device_id: List[int] = [0], runtime_ai100: bool = True, - seq_len: int = constants.Constants.CTX_LEN, **kwargs, ): """ @@ -363,8 +362,7 @@ def generate( is_tlm=self.is_tlm, ) else: - inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len) - return self.model(**inputs) + raise ValueError("Only AI_100 runtime is supported right now via generate API") class QEffAutoModel(QEFFTransformersBase): @@ -395,6 +393,7 @@ def __init__(self, model: nn.Module, **kwargs): self.num_layers = model.config.num_hidden_layers @classmethod + @with_replaced_quantizers def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """ This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel. 
@@ -417,10 +416,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # You can now execute the model model.generate(prompts=["Hi there!!"]) """ + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') - self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") - return self + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False}) + + try: + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + except TypeError: + kwargs.pop("add_pooling_layers", None) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model) @property def model_hash(self) -> str: @@ -538,7 +547,7 @@ def generate( raise TypeError("Please run compile API first!") return QEfficient.cloud_ai_100_exec_embed( - tokenizer=tokenizer, prompt=prompts, qpc_path=self.qpc_path, device_id=device_id + tokenizer=tokenizer, prompts=prompts, qpc_path=self.qpc_path, device_id=device_id ) # PyTorch runtime else: diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index be17732a7..629828d55 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -179,67 +179,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ] ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." - -def check_embed_pytorch_vs_ort_vs_ai100( - model_name: str, - seq_len: int = Constants.CTX_LEN, - n_layer: int = 1, -): - model_path = hf_download( - repo_id=model_name, - ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], - ) - - # Try to initialize with add_pooling_layer parameter - try: - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - add_pooling_layer=False, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) - except TypeError: - # If it fails, initialize without the parameter - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) - - prompt = "My name is" - tokenizer = AutoTokenizer.from_pretrained(model_name) - padding_check_and_fix(tokenizer) - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - - pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) - - onnx_model = qeff_model.export() - ort_session = ort.InferenceSession(str(onnx_model)) - # Prepare the inputs for ONNX Runtime - onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} - # Run inference - onnx_outputs = ort_session.run(None, onnx_inputs) - - # Compare PyTorch and ONNX outputs - pt_embeddings = pt_outputs[0].detach().numpy() - onnx_embeddings = onnx_outputs[0] - mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) - print("Mad for onnx and pytorch is ", mad) - assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" - - qeff_model.compile( - num_cores=14, - ) - ai100_output = qeff_model.generate(tokenizer=tokenizer, prompt=["My name is"]) - - # Compare ONNX and AI 100 outputs - mad = 
np.mean(np.abs(ai100_output["output"] - onnx_outputs[0])) - print("Mad for onnx and AI 100 output is ", mad) - assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" - - # FIXME: there should be a CB test here @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x) def test_causal_lm_export_with_deprecated_api(model_name): @@ -313,23 +252,4 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): prompt_len = 1 check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) - - -embed_test_models = [ - # model_name, architecture - "nomic-ai/nomic-embed-text-v1.5", # NomicBertModel - "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM - "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification - "BAAI/bge-small-en-v1.5", # BertModel - # "intfloat/e5-mistral-7b-instruct", # MistralModel - # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM -] - - -@pytest.mark.on_qaic -@pytest.mark.parametrize("model_name", embed_test_models) -def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): - """ - Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. - """ - check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) + \ No newline at end of file diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py new file mode 100644 index 000000000..fe3ca7d62 --- /dev/null +++ b/tests/transformers/models/test_embedding_models.py @@ -0,0 +1,95 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +import numpy as np +import onnxruntime as ort +import pytest +from transformers import AutoTokenizer + +from QEfficient.transformers.models.modeling_auto import QEffAutoModel +from QEfficient.utils import hf_download, padding_check_and_fix +from QEfficient.utils.constants import Constants + +embed_test_models = [ + # model_name, architecture + "nomic-ai/nomic-embed-text-v1.5", # NomicBertModel + "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM + "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification + "BAAI/bge-small-en-v1.5", # BertModel + # "intfloat/e5-mistral-7b-instruct", # MistralModel + # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM +] + +def check_embed_pytorch_vs_ort_vs_ai100( + model_name: str, + seq_len: int = Constants.CTX_LEN, + n_layer: int = 1, +): + model_path = hf_download( + repo_id=model_name, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + + # Try to initialize with add_pooling_layer parameter + try: + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + add_pooling_layer=False, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) + except TypeError: + # If it fails, initialize without the parameter + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) + + prompt = "My name is" + tokenizer = AutoTokenizer.from_pretrained(model_name) + padding_check_and_fix(tokenizer) + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + + pt_outputs = 
qeff_model.generate(tokenizer=tokenizer, prompts=["My name is"], runtime_ai100=False) + + onnx_model = qeff_model.export() + ort_session = ort.InferenceSession(str(onnx_model)) + # Prepare the inputs for ONNX Runtime + onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} + # Run inference + onnx_outputs = ort_session.run(None, onnx_inputs) + + # Compare PyTorch and ONNX outputs + pt_embeddings = pt_outputs[0][0].detach().numpy() + onnx_embeddings = onnx_outputs[0] + mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) + print("Mad for onnx and pytorch is ", mad) + assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" + + qeff_model.compile( + num_cores=14, + ) + ai100_output = qeff_model.generate(tokenizer=tokenizer, prompts=["My name is"]) + + # Compare ONNX and AI 100 outputs + mad = np.mean(np.abs(ai100_output[0]["output"] - onnx_outputs[0])) + print("Mad for onnx and AI 100 output is ", mad) + assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" + + + +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", embed_test_models) +def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): + """ + Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. + """ + check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) From ba0258b69999ec8e8dc3c0d20ee709f6c066a0da Mon Sep 17 00:00:00 2001 From: amitraj Date: Sun, 15 Dec 2024 11:37:35 +0530 Subject: [PATCH 09/17] Lint and formatted Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 37 +++++++------------ .../models/test_causal_lm_models.py | 9 ++--- .../models/test_embedding_models.py | 2 +- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 40328f725..6d6377ed2 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -369,13 +369,13 @@ def cloud_ai_100_exec_embed( Returns: :dict: Output from the ``AI_100`` runtime. """ - generate_feature=FeatureGeneration( + generate_feature = FeatureGeneration( tokenizer=tokenizer, qpc_path=qpc_path, device_id=device_id, enable_debug_logs=enable_debug_logs, ) - + return generate_feature.generate(prompts=prompts) @@ -398,7 +398,6 @@ def __init__( # Load QPC self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) - # Fetch the variables from the QPC self._vocab_size = self._fetch_vocab_size() # Fetch Vocab size self.batch_size, self._prefill_seq_len = self._fetch_batch_size_prefill_seq_len() @@ -1103,9 +1102,9 @@ def generate( perf_metrics=perf_metrics, ) return latency_stats - + + class QEffFeatureGenerationBase: - def __init__( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], @@ -1115,16 +1114,16 @@ def __init__( enable_debug_logs: bool = False, ) -> None: self.ctx_len = ctx_len - + # Load QPC self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) - + self._batch_size = self._session.bindings[0].dims[0] self._seq_len = self._session.bindings[0].dims[1] - + self.tokenizer = tokenizer self._set_tokenizer_params() # set tokenizer params - + def _set_tokenizer_params(self): """ Sets the tokenizer parameters for the model. 
@@ -1134,8 +1133,8 @@ def _set_tokenizer_params(self): self.tokenizer.padding_side = "right" if self.tokenizer.pad_token_id is None: self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - - + + class FeatureGeneration: def __init__( self, @@ -1145,18 +1144,13 @@ def __init__( device_id: Optional[List[int]] = None, enable_debug_logs: bool = False, ) -> None: - - self._qaic_model = QEffFeatureGenerationBase( - tokenizer, qpc_path, seq_len, device_id, enable_debug_logs - ) + self._qaic_model = QEffFeatureGenerationBase(tokenizer, qpc_path, seq_len, device_id, enable_debug_logs) self._batch_size = self._qaic_model._batch_size self._tokenizer = self._qaic_model.tokenizer self._seq_len = self._qaic_model._seq_len - self._session = self._qaic_model._session - def generate( - self, - prompts: List[str] - ): + self._session = self._qaic_model._session + + def generate(self, prompts: List[str]): outputs = [] for prompt in prompts: @@ -1175,6 +1169,3 @@ def generate( output = self._session.run(inputs) outputs.append(output) return outputs - - - diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 629828d55..6e91711e0 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -8,14 +8,13 @@ from typing import Optional import numpy as np -import onnxruntime as ort import pytest -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers -from QEfficient.utils import hf_download, padding_check_and_fix +from QEfficient.utils import hf_download from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants from QEfficient.utils.device_utils import get_available_device_id @@ -179,6 +178,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ] ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." 
+ # FIXME: there should be a CB test here @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x) def test_causal_lm_export_with_deprecated_api(model_name): @@ -252,4 +252,3 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): prompt_len = 1 check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) - \ No newline at end of file diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index fe3ca7d62..4feb622a9 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -25,6 +25,7 @@ # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM ] + def check_embed_pytorch_vs_ort_vs_ai100( model_name: str, seq_len: int = Constants.CTX_LEN, @@ -85,7 +86,6 @@ def check_embed_pytorch_vs_ort_vs_ai100( assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" - @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", embed_test_models) def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): From ba66c759576de397899c78d0e1b0a46ecf51255e Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 16 Dec 2024 10:23:27 +0530 Subject: [PATCH 10/17] Comments addressed-3 Signed-off-by: amitraj --- .../transformers/models/modeling_auto.py | 3 +-- pyproject.toml | 1 + .../models/test_embedding_models.py | 25 +++++-------------- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 027ca54eb..fd8d0acf7 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -423,11 +423,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False}) - try: model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) except TypeError: - kwargs.pop("add_pooling_layers", None) + kwargs.pop("add_pooling_layer", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) return cls(model) diff --git a/pyproject.toml b/pyproject.toml index fbffbd317..a4d3c0be9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "numpy==1.23.0", "protobuf==3.20.2", "onnxscript==0.1.0.dev20240327", + "einops==0.8.0", "sympy", "torch==2.4.1; platform_machine=='aarch64'", # Specifying torch cpu package URL per python version, update the list once pytorch releases whl for python>3.11 diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 4feb622a9..ab3ba7ba4 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -21,8 +21,6 @@ "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification "BAAI/bge-small-en-v1.5", # BertModel - # "intfloat/e5-mistral-7b-instruct", # MistralModel - # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM ] @@ -36,23 +34,12 @@ def check_embed_pytorch_vs_ort_vs_ai100( ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], ) - # Try to initialize with add_pooling_layer parameter - try: - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - 
add_pooling_layer=False, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) - except TypeError: - # If it fails, initialize without the parameter - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) prompt = "My name is" tokenizer = AutoTokenizer.from_pretrained(model_name) From 38a418647703f3cd60c911bfbf2d6ff0db599e6d Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 16 Dec 2024 11:32:05 +0530 Subject: [PATCH 11/17] Fix-2 Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 32 ++++++++++++++++++- .../transformers/models/modeling_auto.py | 8 +++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 6d6377ed2..e6f3ba3b0 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -17,7 +17,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.utils import padding_check_and_fix +from QEfficient.utils import constants, padding_check_and_fix from QEfficient.utils.logging_utils import logger @@ -379,6 +379,36 @@ def cloud_ai_100_exec_embed( return generate_feature.generate(prompts=prompts) +def pytorch_feature_generate( + model, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + prompts: List[str], + seq_len: int = constants.Constants.CTX_LEN, +): + """ + Generates features from a list of text prompts using a PyTorch model and tokenizer. + + ``Mandatory`` Args: + model: The PyTorch model used for generating features. + tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): The tokenizer used to preprocess the prompts. + prompts (List[str]): A list of text prompts to be tokenized and processed. + ``Optional`` Args: + seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN. + + Returns: + List[torch.Tensor]: A list of output features generated by the model for each prompt. 
+ """ + + outputs = [] + for prompt in prompts: + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + import ipdb + + ipdb.set_trace() + outputs.append(model(**inputs)) + return outputs + + class QEffTextGenerationBase: def __init__( self, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index fd8d0acf7..efb73fc82 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -18,6 +18,7 @@ import QEfficient from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform +from QEfficient.generation.text_generation_inference import pytorch_feature_generate from QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, KVCacheTransform, SpDTransform from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform @@ -99,6 +100,10 @@ def __init__( is_tlm: bool = False, **kwargs, ): + model_class_name = model.__class__.__name__ + if not (model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")): + raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}") + # TODO: remove from version 1.20 if kwargs.pop("full_batch_size", None): continuous_batching = True @@ -550,5 +555,4 @@ def generate( ) # PyTorch runtime else: - inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len) - return self.model(**inputs) + return pytorch_feature_generate(model=self.model, tokenizer=tokenizer, prompts=prompts, seq_len=seq_len) From 4401fd63082f34ce3cb3957cb7152763eb5c482b Mon Sep 17 00:00:00 2001 From: amitraj Date: Tue, 17 Dec 2024 13:34:00 +0530 Subject: [PATCH 12/17] Comments addressed-4 Signed-off-by: amitraj --- QEfficient/__init__.py | 3 +- .../generation/text_generation_inference.py | 131 +----------------- .../transformers/models/modeling_auto.py | 84 +++++++++-- pyproject.toml | 1 - .../models/test_embedding_models.py | 15 +- 5 files changed, 83 insertions(+), 151 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 987399316..0f7f40483 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -8,7 +8,7 @@ from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_embed, cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv from QEfficient.peft import QEffAutoPeftModelForCausalLM from QEfficient.transformers.transform import transform @@ -21,7 +21,6 @@ "export", "compile", "cloud_ai_100_exec_kv", - "cloud_ai_100_exec_embed", "QEffAutoModel", "QEFFAutoModelForCausalLM", "QEffAutoPeftModelForCausalLM", diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index e6f3ba3b0..4ddd57ada 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -17,7 +17,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from 
QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.utils import constants, padding_check_and_fix +from QEfficient.utils import padding_check_and_fix from QEfficient.utils.logging_utils import logger @@ -347,68 +347,6 @@ def cloud_ai_100_exec_kv( return exec_info -def cloud_ai_100_exec_embed( - tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], - qpc_path: str, - prompts: List[str], - device_id: List[int] = [0], - enable_debug_logs: bool = False, -) -> dict: - """ - This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. - If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. - - ``Mandatory`` Args: - :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. - :qpc_path (str): Path to the saved generated binary file after compilation. - :prompt (str): Sample prompt for the model text generation. - ``Optional`` Args: - :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. - - Returns: - :dict: Output from the ``AI_100`` runtime. - """ - generate_feature = FeatureGeneration( - tokenizer=tokenizer, - qpc_path=qpc_path, - device_id=device_id, - enable_debug_logs=enable_debug_logs, - ) - - return generate_feature.generate(prompts=prompts) - - -def pytorch_feature_generate( - model, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - prompts: List[str], - seq_len: int = constants.Constants.CTX_LEN, -): - """ - Generates features from a list of text prompts using a PyTorch model and tokenizer. - - ``Mandatory`` Args: - model: The PyTorch model used for generating features. - tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): The tokenizer used to preprocess the prompts. - prompts (List[str]): A list of text prompts to be tokenized and processed. - ``Optional`` Args: - seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN. - - Returns: - List[torch.Tensor]: A list of output features generated by the model for each prompt. - """ - - outputs = [] - for prompt in prompts: - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - import ipdb - - ipdb.set_trace() - outputs.append(model(**inputs)) - return outputs - - class QEffTextGenerationBase: def __init__( self, @@ -1132,70 +1070,3 @@ def generate( perf_metrics=perf_metrics, ) return latency_stats - - -class QEffFeatureGenerationBase: - def __init__( - self, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, - ctx_len: Optional[int] = None, - device_id: Optional[List[int]] = None, - enable_debug_logs: bool = False, - ) -> None: - self.ctx_len = ctx_len - - # Load QPC - self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) - - self._batch_size = self._session.bindings[0].dims[0] - self._seq_len = self._session.bindings[0].dims[1] - - self.tokenizer = tokenizer - self._set_tokenizer_params() # set tokenizer params - - def _set_tokenizer_params(self): - """ - Sets the tokenizer parameters for the model. 
- """ - if self.tokenizer.padding_side != "right": - logger.warning("Please use padding_side='right' while initializing the tokenizer") - self.tokenizer.padding_side = "right" - if self.tokenizer.pad_token_id is None: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - - -class FeatureGeneration: - def __init__( - self, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, - seq_len: Optional[int] = None, - device_id: Optional[List[int]] = None, - enable_debug_logs: bool = False, - ) -> None: - self._qaic_model = QEffFeatureGenerationBase(tokenizer, qpc_path, seq_len, device_id, enable_debug_logs) - self._batch_size = self._qaic_model._batch_size - self._tokenizer = self._qaic_model.tokenizer - self._seq_len = self._qaic_model._seq_len - self._session = self._qaic_model._session - - def generate(self, prompts: List[str]): - outputs = [] - - for prompt in prompts: - inputs = self._tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self._seq_len) - - inputs = dict( - input_ids=inputs["input_ids"].numpy(), - attention_mask=inputs["attention_mask"].numpy(), - ) - output = { - "output": np.random.randn(self._batch_size, self._seq_len, self._session.bindings[2].dims[2]).astype( - np.float32 - ), - } - self._session.set_buffers(output) - output = self._session.run(inputs) - outputs.append(output) - return outputs diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index efb73fc82..01237db2a 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -11,6 +11,7 @@ from pathlib import Path from typing import List, Optional, Union +import numpy as np import torch import torch.nn as nn from transformers import AutoModel, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -18,11 +19,12 @@ import QEfficient from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform -from QEfficient.generation.text_generation_inference import pytorch_feature_generate +from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, KVCacheTransform, SpDTransform from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform from QEfficient.utils import constants, get_padding_shape_from_config +from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.cache import to_hashable logger = logging.getLogger(__file__) @@ -145,7 +147,7 @@ def from_pretrained( model = QEFFAutoModelForCausalLM.from_pretrained("gpt2") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + model.compile(num_cores=6, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU # You can now execute the model model.generate(prompts=["Hi there!!"]) @@ -396,6 +398,7 @@ def __init__(self, model: nn.Module, **kwargs): super().__init__(model) self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers + self.tokenizer = load_hf_tokenizer(self.model.config.name_or_path) @classmethod @with_replaced_quantizers @@ -416,7 +419,7 @@ def from_pretrained(cls, 
pretrained_model_name_or_path, *args, **kwargs): model = QEFFAutoModel.from_pretrained("BAAI/bge-small-en-v1.5") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 SKU # You can now execute the model model.generate(prompts=["Hi there!!"]) @@ -524,9 +527,8 @@ def compile( def generate( self, - tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = [0], + device_ids: List[int] = [0], runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, ) -> dict: @@ -550,9 +552,73 @@ def generate( if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") - return QEfficient.cloud_ai_100_exec_embed( - tokenizer=tokenizer, prompts=prompts, qpc_path=self.qpc_path, device_id=device_id - ) + return self.cloud_ai_100_feature_generate(prompts=prompts, device_ids=device_ids) # PyTorch runtime else: - return pytorch_feature_generate(model=self.model, tokenizer=tokenizer, prompts=prompts, seq_len=seq_len) + return self.pytorch_feature_generate(model=self.model, prompts=prompts, seq_len=seq_len) + + def cloud_ai_100_feature_generate( + self, + prompts: List[str], + device_ids: List[int] = [0], + ): + """ + Generates features using the QAICInferenceSession for a list of prompts. + + This function initializes a QAICInferenceSession if not already initialized, + tokenizes the input prompts, and generates output features using the session. + + Args: + prompts (List[str]): A list of input prompts to generate features for. + device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. + + Returns: + List[Dict[str, np.ndarray]]: A list of dictionaries containing the generated output features. + """ + if self.qpc_session is None: + self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) + self.batch_size = self.qpc_session.bindings[0].dims[0] + self.seq_len = self.qpc_session.bindings[0].dims[1] + outputs = [] + + for prompt in prompts: + inputs = self.tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self.seq_len) + + inputs = dict( + input_ids=inputs["input_ids"].numpy(), + attention_mask=inputs["attention_mask"].numpy(), + ) + output = { + "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( + np.float32 + ), + } + self.qpc_session.set_buffers(output) + output = self.qpc_session.run(inputs) + outputs.append(output) + return outputs + + def pytorch_feature_generate( + self, + model, + prompts: List[str], + seq_len: int = constants.Constants.CTX_LEN, + ): + """ + Generates features from a list of text prompts using a PyTorch model. + + ``Mandatory`` Args: + model: The PyTorch model used for generating features. + prompts (List[str]): A list of text prompts to be tokenized and processed. + ``Optional`` Args: + seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN. + + Returns: + List[torch.Tensor]: A list of output features generated by the model for each prompt. 
+ """ + + outputs = [] + for prompt in prompts: + inputs = self.tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + outputs.append(model(**inputs)) + return outputs diff --git a/pyproject.toml b/pyproject.toml index a4d3c0be9..fbffbd317 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,6 @@ dependencies = [ "numpy==1.23.0", "protobuf==3.20.2", "onnxscript==0.1.0.dev20240327", - "einops==0.8.0", "sympy", "torch==2.4.1; platform_machine=='aarch64'", # Specifying torch cpu package URL per python version, update the list once pytorch releases whl for python>3.11 diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index ab3ba7ba4..ed41c7349 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -9,15 +9,14 @@ import numpy as np import onnxruntime as ort import pytest -from transformers import AutoTokenizer from QEfficient.transformers.models.modeling_auto import QEffAutoModel -from QEfficient.utils import hf_download, padding_check_and_fix +from QEfficient.utils import hf_download +from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants embed_test_models = [ # model_name, architecture - "nomic-ai/nomic-embed-text-v1.5", # NomicBertModel "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification "BAAI/bge-small-en-v1.5", # BertModel @@ -42,15 +41,13 @@ def check_embed_pytorch_vs_ort_vs_ai100( ) prompt = "My name is" - tokenizer = AutoTokenizer.from_pretrained(model_name) - padding_check_and_fix(tokenizer) - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - - pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompts=["My name is"], runtime_ai100=False) + pt_outputs = qeff_model.generate(prompts=["My name is"], runtime_ai100=False) onnx_model = qeff_model.export() ort_session = ort.InferenceSession(str(onnx_model)) # Prepare the inputs for ONNX Runtime + tokenizer = load_hf_tokenizer(model_path) + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} # Run inference onnx_outputs = ort_session.run(None, onnx_inputs) @@ -65,7 +62,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( qeff_model.compile( num_cores=14, ) - ai100_output = qeff_model.generate(tokenizer=tokenizer, prompts=["My name is"]) + ai100_output = qeff_model.generate(prompts=["My name is"]) # Compare ONNX and AI 100 outputs mad = np.mean(np.abs(ai100_output[0]["output"] - onnx_outputs[0])) From 206c81a0bd60ede7f29e098b92c711e0706c7f11 Mon Sep 17 00:00:00 2001 From: amitraj Date: Tue, 17 Dec 2024 13:54:00 +0530 Subject: [PATCH 13/17] Minor fix-1 Signed-off-by: amitraj --- QEfficient/transformers/models/modeling_auto.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 01237db2a..ab87242e6 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -391,7 +391,7 @@ class QEffAutoModel(QEFFTransformersBase): """ _hf_auto_class = AutoModel - _pytorch_transforms = [CustomOpsTransform] + _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, 
GPTQToMatmulNbitsTransform] _onnx_transforms = [FP16ClipTransform] def __init__(self, model: nn.Module, **kwargs): @@ -416,10 +416,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): from QEfficient import QEFFAutoModel # Initialize the model using from_pretrained similar to transformers.AutoModel. - model = QEFFAutoModel.from_pretrained("BAAI/bge-small-en-v1.5") + model = QEFFAutoModel.from_pretrained("model_name") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 SKU + model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU # You can now execute the model model.generate(prompts=["Hi there!!"]) @@ -563,17 +563,15 @@ def cloud_ai_100_feature_generate( device_ids: List[int] = [0], ): """ - Generates features using the QAICInferenceSession for a list of prompts. + Generates features with list of prompts using AI 100 runtime. - This function initializes a QAICInferenceSession if not already initialized, - tokenizes the input prompts, and generates output features using the session. - - Args: + ``Mandatory`` Args: prompts (List[str]): A list of input prompts to generate features for. + ``Optional`` Args: device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. Returns: - List[Dict[str, np.ndarray]]: A list of dictionaries containing the generated output features. + List[Dict[np.ndarray]]: A list of dictionaries containing the generated output features. """ if self.qpc_session is None: self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) From 0f1f8bb4825da1189bcf16c32aa849ea03dcfaa6 Mon Sep 17 00:00:00 2001 From: amitraj Date: Wed, 18 Dec 2024 14:34:48 +0530 Subject: [PATCH 14/17] fix-major Signed-off-by: amitraj --- QEfficient/__init__.py | 4 +- QEfficient/base/__init__.py | 2 +- QEfficient/base/common.py | 2 +- .../transformers/models/modeling_auto.py | 111 ++++++++++-------- docs/source/hl_api.md | 7 +- .../models/test_embedding_models.py | 52 ++++++-- 6 files changed, 108 insertions(+), 70 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 0f7f40483..8e32a1e6e 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader +from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv @@ -21,7 +21,7 @@ "export", "compile", "cloud_ai_100_exec_kv", - "QEffAutoModel", + "QEFFAutoModel", "QEFFAutoModelForCausalLM", "QEffAutoPeftModelForCausalLM", "QEFFCommonLoader", diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py index 257051d97..86cff11c1 100644 --- a/QEfficient/base/__init__.py +++ b/QEfficient/base/__init__.py @@ -6,4 +6,4 @@ # ----------------------------------------------------------------------------- from QEfficient.base.common import QEFFCommonLoader # noqa: F401 -from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401 +from QEfficient.transformers.models.modeling_auto import QEFFAutoModel, QEFFAutoModelForCausalLM # noqa: F401 diff --git 
a/QEfficient/base/common.py b/QEfficient/base/common.py index e42e74687..192294738 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -76,7 +76,7 @@ def __init__(self, *args: Any, **kwds: Any) -> None: @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel: """ - Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model. + Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model. """ if not os.path.isdir(pretrained_model_name_or_path): pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index ab87242e6..9233ed563 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -372,9 +372,9 @@ def generate( raise ValueError("Only AI_100 runtime is supported right now via generate API") -class QEffAutoModel(QEFFTransformersBase): +class QEFFAutoModel(QEFFTransformersBase): """ - The QEffAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. + The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. ``Mandatory`` Args: @@ -382,12 +382,21 @@ class QEffAutoModel(QEFFTransformersBase): .. code-block:: python - from QEfficient import QEffAutoModel + from QEfficient import QEFFAutoModel + from transformers import AutoTokenizer - model = QEffAutoModel.from_pretrained(model_name, num_hidden_layers=2) - model.compile() + # Initialize the model using from_pretrained similar to transformers.AutoModel. + model = QEFFAutoModel.from_pretrained("model_name") - model.generate(prompts=["Hello, world!"]) + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU + + #prepare input + tokenizer = AutoTokenizer.from_pretrained(model_name) + inputs = tokenizer("My name is", return_tensors="pt") + + # You can now execute the model + model.generate(inputs) """ _hf_auto_class = AutoModel @@ -414,6 +423,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): .. code-block:: python from QEfficient import QEFFAutoModel + from transformers import AutoTokenizer # Initialize the model using from_pretrained similar to transformers.AutoModel. model = QEFFAutoModel.from_pretrained("model_name") @@ -421,8 +431,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Now you can directly compile the model for Cloud AI 100 model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU + #prepare input + tokenizer = AutoTokenizer.from_pretrained(model_name) + inputs = tokenizer("My name is", return_tensors="pt") + # You can now execute the model - model.generate(prompts=["Hi there!!"]) + model.generate(inputs) """ if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -458,7 +472,7 @@ def export(self, export_dir: Optional[str] = None) -> str: Exports the model to ``ONNX`` format using ``torch.onnx.export``. ``Optional`` Args: - does not any arguments. 
+ :export_dir (str, optional): The directory path to store ONNX-graph. Returns: :str: Path of the generated ``ONNX`` graph. @@ -504,7 +518,11 @@ def compile( :compile_dir (str, optional): Path for saving the qpc generated. :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``. :batch_size (int, optional): Batch size. ``Defaults to 1``. + :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. :num_cores (int): Number of cores used to compile the model. + :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. + :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. """ @@ -527,96 +545,85 @@ def compile( def generate( self, - prompts: List[str], + inputs: Union[torch.Tensor, np.ndarray], device_ids: List[int] = [0], runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, ) -> dict: """ - This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. - If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. - + This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. ``Mandatory`` Args: - :prompts (List[str]): List of prompts to run the execution. + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model ``optional`` Args: :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. - + :eq_len (int, optional): Sequence length for the inputs. Defaults to constants.Constants.CTX_LEN. Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. """ + # Prepare input + input_ids = torch.nn.functional.pad( + inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0 + ) + attention_mask = torch.nn.functional.pad( + inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0 + ) + + inputs = dict(input_ids=input_ids, attention_mask=attention_mask) # AI_100 runtime if runtime_ai100: if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") - return self.cloud_ai_100_feature_generate(prompts=prompts, device_ids=device_ids) + return self.cloud_ai_100_feature_generate(inputs=inputs, device_ids=device_ids) # PyTorch runtime else: - return self.pytorch_feature_generate(model=self.model, prompts=prompts, seq_len=seq_len) + return self.pytorch_feature_generate(model=self.model, inputs=inputs) def cloud_ai_100_feature_generate( self, - prompts: List[str], + inputs: Union[torch.Tensor, np.ndarray], device_ids: List[int] = [0], ): """ Generates features with list of prompts using AI 100 runtime. ``Mandatory`` Args: - prompts (List[str]): A list of input prompts to generate features for. + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. ``Optional`` Args: device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. 
Returns: - List[Dict[np.ndarray]]: A list of dictionaries containing the generated output features. + np.ndarray: A list of dictionaries containing the generated output features. """ + if self.qpc_session is None: self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) self.batch_size = self.qpc_session.bindings[0].dims[0] self.seq_len = self.qpc_session.bindings[0].dims[1] - outputs = [] - for prompt in prompts: - inputs = self.tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self.seq_len) - - inputs = dict( - input_ids=inputs["input_ids"].numpy(), - attention_mask=inputs["attention_mask"].numpy(), - ) - output = { - "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( - np.float32 - ), - } - self.qpc_session.set_buffers(output) - output = self.qpc_session.run(inputs) - outputs.append(output) + inputs["input_ids"] = np.array(inputs["input_ids"]) + inputs["attention_mask"] = np.array(inputs["attention_mask"]) + outputs = { + "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( + np.float32 + ), + } + self.qpc_session.set_buffers(outputs) + outputs = self.qpc_session.run(inputs) return outputs - def pytorch_feature_generate( - self, - model, - prompts: List[str], - seq_len: int = constants.Constants.CTX_LEN, - ): + def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]): """ Generates features from a list of text prompts using a PyTorch model. ``Mandatory`` Args: - model: The PyTorch model used for generating features. - prompts (List[str]): A list of text prompts to be tokenized and processed. - ``Optional`` Args: - seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN. + model: The transformed PyTorch model used for generating features. + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. Returns: - List[torch.Tensor]: A list of output features generated by the model for each prompt. + torch.Tensor: A list of output features generated by the model for each prompt. """ - outputs = [] - for prompt in prompts: - inputs = self.tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - outputs.append(model(**inputs)) - return outputs + return model(**inputs) diff --git a/docs/source/hl_api.md b/docs/source/hl_api.md index 47dd6cde8..558965e76 100644 --- a/docs/source/hl_api.md +++ b/docs/source/hl_api.md @@ -8,7 +8,12 @@ :member-order: bysource :members: ``` - +## `QEFFAutoModel` +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel + :member-order: bysource + :members: +``` ## `QEffAutoPeftModelForCausalLM` ```{eval-rst} .. 
autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index ed41c7349..2de882b9c 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -9,10 +9,11 @@ import numpy as np import onnxruntime as ort import pytest +import torch +from transformers import AutoModel, AutoTokenizer -from QEfficient.transformers.models.modeling_auto import QEffAutoModel +from QEfficient.transformers.models.modeling_auto import QEFFAutoModel from QEfficient.utils import hf_download -from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants embed_test_models = [ @@ -32,40 +33,65 @@ def check_embed_pytorch_vs_ort_vs_ai100( repo_id=model_name, ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], ) + # Prepare input + tokenizer = AutoTokenizer.from_pretrained(model_name) + inputs = tokenizer("My name is", return_tensors="pt") - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, + input_ids = torch.nn.functional.pad(inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0) + attention_mask = torch.nn.functional.pad( + inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0 + ) + inputs = dict(input_ids=input_ids, attention_mask=attention_mask) + + # Original PyTorch model + pt_model = AutoModel.from_pretrained( + model_path, num_hidden_layers=n_layer, attn_implementation="eager", trust_remote_code=True, ) - prompt = "My name is" - pt_outputs = qeff_model.generate(prompts=["My name is"], runtime_ai100=False) + pt_outputs = pt_model(**inputs) + pt_embeddings = pt_outputs[0][0].detach().numpy() + + # Pytorch transformed model + qeff_model = QEFFAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) + qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) + qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy() + mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings)) + print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad) + assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}" onnx_model = qeff_model.export() ort_session = ort.InferenceSession(str(onnx_model)) + # Prepare the inputs for ONNX Runtime - tokenizer = load_hf_tokenizer(model_path) - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} + input_ids = np.array(input_ids) + attention_mask = np.array(attention_mask) + + onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} # Run inference onnx_outputs = ort_session.run(None, onnx_inputs) - # Compare PyTorch and ONNX outputs + # Compare Transformed PyTorch and ONNX outputs pt_embeddings = pt_outputs[0][0].detach().numpy() onnx_embeddings = onnx_outputs[0] mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) - print("Mad for onnx and pytorch is ", mad) + print("Mad for onnx and PyTorch is ", mad) assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" qeff_model.compile( num_cores=14, ) - ai100_output = qeff_model.generate(prompts=["My name is"]) + ai100_output = qeff_model.generate(inputs=inputs) # Compare ONNX and AI 100 
outputs - mad = np.mean(np.abs(ai100_output[0]["output"] - onnx_outputs[0])) + mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0])) print("Mad for onnx and AI 100 output is ", mad) assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" From 6c9de4b414b4e2fa4076980119fee3f8ee3829c8 Mon Sep 17 00:00:00 2001 From: amitraj Date: Wed, 18 Dec 2024 16:02:06 +0530 Subject: [PATCH 15/17] fix-minor-2 Signed-off-by: amitraj --- QEfficient/base/modeling_qeff.py | 4 +- .../transformers/models/modeling_auto.py | 37 +++++++++---------- .../models/test_embedding_models.py | 29 +++------------ 3 files changed, 27 insertions(+), 43 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 064d7e6f0..82fc42215 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -251,7 +251,9 @@ def _compile( # Check if already compiled compile_hash = compile_hash.hexdigest()[:16] - qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash) + compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash) + qpc_path = compile_dir / "qpc" + qpc_path.mkdir(parents=True, exist_ok=True) if qpc_path.is_dir(): if (qpc_path / "programqpc.bin").is_file(): self.qpc_path = qpc_path diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 9233ed563..de21f070b 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -24,7 +24,6 @@ from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform from QEfficient.utils import constants, get_padding_shape_from_config -from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.cache import to_hashable logger = logging.getLogger(__file__) @@ -369,7 +368,7 @@ def generate( is_tlm=self.is_tlm, ) else: - raise ValueError("Only AI_100 runtime is supported right now via generate API") + raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") class QEFFAutoModel(QEFFTransformersBase): @@ -401,13 +400,12 @@ class QEFFAutoModel(QEFFTransformersBase): _hf_auto_class = AutoModel _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform] - _onnx_transforms = [FP16ClipTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model: nn.Module, **kwargs): super().__init__(model) self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers - self.tokenizer = load_hf_tokenizer(self.model.config.name_or_path) @classmethod @with_replaced_quantizers @@ -447,6 +445,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False}) try: model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + warnings.warn("Removing pooling layer from the model if exist") except TypeError: kwargs.pop("add_pooling_layer", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) @@ -545,10 +544,9 @@ def compile( def generate( self, - inputs: Union[torch.Tensor, np.ndarray], + inputs: torch.Tensor, device_ids: List[int] = [0], runtime_ai100: bool = True, - seq_len: int = constants.Constants.CTX_LEN, ) 
-> dict: """ This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. @@ -561,16 +559,6 @@ def generate( Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. """ - # Prepare input - input_ids = torch.nn.functional.pad( - inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0 - ) - attention_mask = torch.nn.functional.pad( - inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0 - ) - - inputs = dict(input_ids=input_ids, attention_mask=attention_mask) - # AI_100 runtime if runtime_ai100: if not isinstance(self.qpc_path, Path): @@ -583,7 +571,7 @@ def generate( def cloud_ai_100_feature_generate( self, - inputs: Union[torch.Tensor, np.ndarray], + inputs: torch.Tensor, device_ids: List[int] = [0], ): """ @@ -602,9 +590,19 @@ def cloud_ai_100_feature_generate( self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) self.batch_size = self.qpc_session.bindings[0].dims[0] self.seq_len = self.qpc_session.bindings[0].dims[1] + # Prepare input + input_ids_len = inputs["input_ids"].shape[1] + input_ids = np.array( + torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - inputs["input_ids"].size(1)), "constant", 0) + ) + attention_mask = np.array( + torch.nn.functional.pad( + inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0 + ) + ) + + inputs = dict(input_ids=input_ids, attention_mask=attention_mask) - inputs["input_ids"] = np.array(inputs["input_ids"]) - inputs["attention_mask"] = np.array(inputs["attention_mask"]) outputs = { "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( np.float32 @@ -612,6 +610,7 @@ def cloud_ai_100_feature_generate( } self.qpc_session.set_buffers(outputs) outputs = self.qpc_session.run(inputs) + outputs = outputs["output"][:, :input_ids_len, :] return outputs def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]): diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 2de882b9c..1c2d5196c 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -9,11 +9,9 @@ import numpy as np import onnxruntime as ort import pytest -import torch from transformers import AutoModel, AutoTokenizer from QEfficient.transformers.models.modeling_auto import QEFFAutoModel -from QEfficient.utils import hf_download from QEfficient.utils.constants import Constants embed_test_models = [ @@ -29,23 +27,13 @@ def check_embed_pytorch_vs_ort_vs_ai100( seq_len: int = Constants.CTX_LEN, n_layer: int = 1, ): - model_path = hf_download( - repo_id=model_name, - ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], - ) # Prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) inputs = tokenizer("My name is", return_tensors="pt") - input_ids = torch.nn.functional.pad(inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0) - attention_mask = torch.nn.functional.pad( - inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0 - ) - inputs = dict(input_ids=input_ids, attention_mask=attention_mask) - # Original PyTorch model pt_model = AutoModel.from_pretrained( - model_path, + model_name, num_hidden_layers=n_layer, attn_implementation="eager", trust_remote_code=True, @@ -53,14 +41,8 @@ def 
check_embed_pytorch_vs_ort_vs_ai100( pt_outputs = pt_model(**inputs) pt_embeddings = pt_outputs[0][0].detach().numpy() - # Pytorch transformed model - qeff_model = QEFFAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) + qeff_model = QEFFAutoModel(pt_model) qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy() mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings)) @@ -71,14 +53,15 @@ def check_embed_pytorch_vs_ort_vs_ai100( ort_session = ort.InferenceSession(str(onnx_model)) # Prepare the inputs for ONNX Runtime - input_ids = np.array(input_ids) - attention_mask = np.array(attention_mask) + input_ids = np.array(inputs["input_ids"]) + attention_mask = np.array(inputs["attention_mask"]) onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} # Run inference onnx_outputs = ort_session.run(None, onnx_inputs) # Compare Transformed PyTorch and ONNX outputs + pt_embeddings = pt_outputs[0][0].detach().numpy() onnx_embeddings = onnx_outputs[0] mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) @@ -91,7 +74,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( ai100_output = qeff_model.generate(inputs=inputs) # Compare ONNX and AI 100 outputs - mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0])) + mad = np.mean(np.abs(ai100_output - onnx_outputs[0])) print("Mad for onnx and AI 100 output is ", mad) assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" From 88e0fe653fe895996a5db38324b3a76fe2cb5b0b Mon Sep 17 00:00:00 2001 From: amitraj Date: Wed, 18 Dec 2024 17:33:12 +0530 Subject: [PATCH 16/17] fix-minor-3 Signed-off-by: amitraj --- QEfficient/transformers/models/modeling_auto.py | 7 +++---- scripts/Jenkinsfile | 12 ++++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index de21f070b..83c573f6d 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -547,7 +547,7 @@ def generate( inputs: torch.Tensor, device_ids: List[int] = [0], runtime_ai100: bool = True, - ) -> dict: + ) -> Union[torch.Tensor, np.ndarray]: """ This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. ``Mandatory`` Args: @@ -573,7 +573,7 @@ def cloud_ai_100_feature_generate( self, inputs: torch.Tensor, device_ids: List[int] = [0], - ): + ) -> np.ndarray: """ Generates features with list of prompts using AI 100 runtime. @@ -613,7 +613,7 @@ def cloud_ai_100_feature_generate( outputs = outputs["output"][:, :input_ids_len, :] return outputs - def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]): + def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]: """ Generates features from a list of text prompts using a PyTorch model. @@ -624,5 +624,4 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray Returns: torch.Tensor: A list of output features generated by the model for each prompt. """ - return model(**inputs) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 2e6f17f4e..f1d37fe86 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -13,8 +13,8 @@ pipeline { steps { sh ''' . 
~/.bashrc - docker run --privileged -dit --name ${BUILD_TAG} -v ./:/efficient-transformers -v ${HF_PATH}:${DOCKER_HF_PATH} ${DOCKER_LATEST}:master_latest - docker exec ${BUILD_TAG} bash -c " + sudo docker run --privileged -dit --name ${BUILD_TAG} -v ./:/efficient-transformers -v ${HF_PATH}:${DOCKER_HF_PATH} ${DOCKER_LATEST}:master_latest + sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && apt update && apt install -y python3.10-venv && @@ -34,7 +34,7 @@ pipeline { steps { timeout(time: 10, unit: 'MINUTES') { sh ''' - docker exec ${BUILD_TAG} bash -c " + sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && . preflight_qeff/bin/activate && mkdir -p $PWD/Non_cli_qaic && @@ -50,7 +50,7 @@ pipeline { steps { timeout(time: 60, unit: 'MINUTES') { sh ''' - docker exec ${BUILD_TAG} bash -c " + sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && . preflight_qeff/bin/activate && mkdir -p $PWD/Non_qaic && @@ -68,7 +68,7 @@ pipeline { steps { timeout(time: 15, unit: 'MINUTES') { sh ''' - docker exec ${BUILD_TAG} bash -c " + sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && . preflight_qeff/bin/activate && mkdir -p $PWD/cli && @@ -88,7 +88,7 @@ pipeline { script { try { sh ''' - docker rm -f ${BUILD_TAG} + sudo docker rm -f ${BUILD_TAG} sudo chown -R ubuntu . ''' } catch (error) { From 157142a274c821f34f5a53e1a6f60987ac4c5d1b Mon Sep 17 00:00:00 2001 From: amitraj Date: Wed, 18 Dec 2024 20:24:04 +0530 Subject: [PATCH 17/17] Update ONNX_EXPORT_OPSET to 13 Signed-off-by: amitraj --- QEfficient/utils/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 5e3a29072..4a3ba3ff3 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -47,7 +47,7 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32 ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep -ONNX_EXPORT_OPSET = 14 +ONNX_EXPORT_OPSET = 13 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]
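
For reference, below is a minimal usage sketch of the embedding workflow that this series converges on (QEFFAutoModel.from_pretrained, then export/compile, then generate). It is illustrative only: the checkpoint name, the core count, and the try/except guard are assumptions, and the compile and AI 100 steps additionally require the Cloud AI 100 toolchain and a device.

```python
# Sketch only: mirrors the QEFFAutoModel embedding API as it stands after these patches.
# Assumed: "BAAI/bge-small-en-v1.5" as an example checkpoint and num_cores=16; the
# AI 100 portion needs Cloud AI 100 hardware plus its SDK, so it is wrapped in a guard.
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModel

model_name = "BAAI/bge-small-en-v1.5"  # assumed example model
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("My name is", return_tensors="pt")

qeff_model = QEFFAutoModel.from_pretrained(model_name)

# PyTorch runtime: returns the transformed model's forward output.
pt_out = qeff_model.generate(inputs=inputs, runtime_ai100=False)
pt_embeddings = pt_out[0][0].detach().numpy()

# ONNX Runtime check against the exported graph (no AI 100 device needed).
onnx_path = qeff_model.export()
ort_session = ort.InferenceSession(str(onnx_path))
onnx_out = ort_session.run(
    None,
    {
        "input_ids": inputs["input_ids"].numpy(),
        "attention_mask": inputs["attention_mask"].numpy(),
    },
)
print("MAD (PyTorch vs ONNX):", np.mean(np.abs(pt_embeddings - onnx_out[0])))

# Cloud AI 100 runtime: requires the platform toolchain and a device.
try:
    qeff_model.compile(num_cores=16)                # assumed core count
    ai100_out = qeff_model.generate(inputs=inputs)  # np.ndarray, trimmed to input length
    print("MAD (ONNX vs AI 100):", np.mean(np.abs(ai100_out - onnx_out[0])))
except Exception as exc:  # no Cloud AI 100 toolchain or device available
    print("Skipping AI 100 run:", exc)
```

As of the later patches in the series, the AI 100 path pads the tokenized inputs to the compiled sequence length internally and trims the returned embeddings back to the original input length, so the two MAD checks above compare tensors of the same shape.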