From 0b80dacf623150a0359272994246e54e4a1cd772 Mon Sep 17 00:00:00 2001 From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com> Date: Wed, 20 Nov 2024 22:24:09 +0530 Subject: [PATCH 01/17] Docker-driven tests with latest SDKs (#180) * Added Docker support to the Jenkins tests Signed-off-by: amitraj * Addressed comments Signed-off-by: amitraj * updated qaic tests time upper limit to 60 minutes Signed-off-by: Onkar Chougule --------- Signed-off-by: amitraj * Added support for Embedding moodels --- QEfficient/__init__.py | 3 +- .../generation/text_generation_inference.py | 24 +++++- .../transformers/models/modeling_auto.py | 84 ++++++++++++++++--- .../transformers/models/pytorch_transforms.py | 3 + QEfficient/transformers/quantizers/auto.py | 1 - .../models/test_causal_lm_models.py | 59 ++++++++++++- 6 files changed, 159 insertions(+), 15 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 0f7f40483..987399316 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -8,7 +8,7 @@ from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_embed, cloud_ai_100_exec_kv from QEfficient.peft import QEffAutoPeftModelForCausalLM from QEfficient.transformers.transform import transform @@ -21,6 +21,7 @@ "export", "compile", "cloud_ai_100_exec_kv", + "cloud_ai_100_exec_embed", "QEffAutoModel", "QEFFAutoModelForCausalLM", "QEffAutoPeftModelForCausalLM", diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 4ddd57ada..3f8692dbe 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -347,7 +347,29 @@ def cloud_ai_100_exec_kv( return exec_info -class QEffTextGenerationBase: +def cloud_ai_100_exec_embed( + tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], + prompt: List[str], + qpc_path: str, + device_id: List[int] = [0], +): + session = QAICInferenceSession(qpc_path, device_ids=device_id) + seq_len = session.bindings[0].dims[1] + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + + prefill_inputs = dict( + input_ids=inputs["input_ids"].numpy(), + attention_mask=inputs["attention_mask"].numpy(), + ) + prefill_logits = { + "output": np.random.randn(1, seq_len, session.bindings[2].dims[2]).astype(np.float32), + } + session.set_buffers(prefill_logits) + prefill_outputs = session.run(prefill_inputs) + return prefill_outputs + + +class TextGeneration: def __init__( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index d0bb4285f..719cdee3c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -9,7 +9,7 @@ import logging import warnings from pathlib import Path -from typing import Any, List, Optional, Union +from typing import List, Optional, Union import torch import torch.nn as nn @@ -35,9 +35,6 @@ class QEFFTransformersBase(QEFFBaseModel): _hf_auto_class: type def __init__(self, model: nn.Module) -> None: - model_class_name = model.__class__.__name__ - if not 
(model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")): - raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}") if hasattr(model.config, "quantization_config") and not isinstance( model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values()) @@ -386,11 +383,78 @@ def generate( class QEffAutoModel(QEFFTransformersBase): _hf_auto_class = AutoModel - _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform] - _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + _pytorch_transforms = [CustomOpsTransform] + _onnx_transforms = [FP16ClipTransform] + + def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs): + super().__init__(model) + self.model.config.use_cache = True + self.num_layers = model.config.num_hidden_layers + + def export(self, export_dir: Optional[str] = None) -> str: + seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + + example_inputs = { + "input_ids": torch.zeros((1, seq_len), dtype=torch.int64), + "attention_mask": torch.ones((1, seq_len), dtype=torch.int64), + } + + dynamic_axes = {"input_ids": {1: "seq_len"}, "attention_mask": {1: "seq_len"}} + + output_names = ["output"] + + return self._export( + example_inputs, + output_names, + dynamic_axes, + export_dir=export_dir, + ) + + def compile( + self, + onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + *, + seq_len: int = 32, + num_cores: int = 14, # FIXME: Make this mandatory arg + **compiler_options, + ) -> str: + specializations = [ + {"seq_len": seq_len}, + ] + + return self._compile( + onnx_path, + compile_dir, + compile_only=True, + specializations=specializations, + convert_to_fp16=True, + aic_num_cores=num_cores, + **compiler_options, + ) + + def generate( + self, + tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], + prompt: List[str], + device_id: List[int] = [0], + runtime_ai100: bool = True, + seq_len: int = constants.Constants.CTX_LEN, + ): + if runtime_ai100: + if not isinstance(self.qpc_path, Path): + raise TypeError("Please run compile API first!") + + return QEfficient.cloud_ai_100_exec_embed( + tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id + ) + else: + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + return self.model(**inputs) + - def export(self): - raise NotImplementedError("Reached too far!!") + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - def compile(self, *args, **kwargs) -> Any: - raise NotImplementedError("Reached too far!!") + return self diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 6b8d00689..6a15befcf 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -305,6 +305,9 @@ class KVCacheTransform(ModuleMappingTransform): @classmethod def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: + import ipdb + + ipdb.set_trace() model, transformed = super().apply(model) # FIXME: see if we can merge into _module_mapping dict transformers.cache_utils.DynamicCache.update = QEffDynamicCache.update diff --git a/QEfficient/transformers/quantizers/auto.py b/QEfficient/transformers/quantizers/auto.py index f4cec3b54..b5b4be099 100644 --- 
a/QEfficient/transformers/quantizers/auto.py +++ b/QEfficient/transformers/quantizers/auto.py @@ -38,7 +38,6 @@ def wrapper(*args, **kwargs): # Put back quantization config and quantizer for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): - AUTO_QUANTIZATION_CONFIG_MAPPING[k] = transformers_replaced_quantization_config_mapping[k] AUTO_QUANTIZER_MAPPING[k] = transformers_replaced_quantizer_mapping[k] return out diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 6e91711e0..de0aec0ca 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -8,11 +8,12 @@ from typing import Optional import numpy as np +import onnxruntime as ort import pytest -from transformers import AutoModelForCausalLM +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers from QEfficient.utils import hf_download from QEfficient.utils._utils import load_hf_tokenizer @@ -179,6 +180,55 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." +def check_embed_pytorch_vs_ort_vs_ai100( + model_name: str, + seq_len: int = Constants.CTX_LEN, + n_layer: int = 1, +): + model_path = hf_download( + repo_id=model_name, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + + # Try to initialize with add_pooling_layer parameter + try: + model = AutoModel.from_pretrained(model_name, add_pooling_layer=False) + qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path, add_pooling_layer=False) + except TypeError: + # If it fails, initialize without the parameter + model = AutoModel.from_pretrained(model_name) + qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path) + text = "My name is" + tokenizer = AutoTokenizer.from_pretrained(model_name) + inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=seq_len) + + pt_outputs=qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) + + onnx_model = qeff_model.export() + ort_session = ort.InferenceSession(str(onnx_model)) + # Prepare the inputs for ONNX Runtime + onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} + # Run inference + onnx_outputs = ort_session.run(None, onnx_inputs) + + # Compare PyTorch and ONNX outputs + pt_embeddings = pt_outputs[0].detach().numpy() + onnx_embeddings = onnx_outputs[0] + mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) + print("Mad for onnx and pytorch is ", mad) + assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" + + qeff_model.compile( + num_cores=14, + ) + ai100_output = qeff_model.generate(tokenizer=tokenizer, prompt=["My name is"]) + + # Compare ONNX and AI 100 outputs + mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0])) + print("Mad for onnx and AI 100 output is ", mad) + assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}" + + # FIXME: there should be a CB test here @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: 
x) def test_causal_lm_export_with_deprecated_api(model_name): @@ -252,3 +302,8 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): prompt_len = 1 check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) + +@pytest.mark.on_qaic +def test_embed_model_pytorch_vs_onnx_vs_ai100(): + model_name = "BAAI/bge-small-en-v1.5" + check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) From e83b29a851f4171f8e8e72474790eb5960f59ef4 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 10:25:35 +0530 Subject: [PATCH 02/17] Added support for embedding models Signed-off-by: amitraj --- QEfficient/generation/text_generation_inference.py | 2 +- QEfficient/transformers/models/modeling_auto.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 3f8692dbe..eaa62a926 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -369,7 +369,7 @@ def cloud_ai_100_exec_embed( return prefill_outputs -class TextGeneration: +class QEffTextGenerationBase: def __init__( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 719cdee3c..578bef184 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -440,7 +440,7 @@ def generate( device_id: List[int] = [0], runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, - ): + ) -> str: if runtime_ai100: if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") From be592874643d276313aa91ac2907c991dbbd0bb3 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 10:53:16 +0530 Subject: [PATCH 03/17] Lint & Format Signed-off-by: amitraj --- QEfficient/transformers/models/modeling_auto.py | 4 +--- QEfficient/transformers/models/pytorch_transforms.py | 3 --- tests/transformers/models/test_causal_lm_models.py | 7 +++---- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 578bef184..a803906df 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -35,7 +35,6 @@ class QEFFTransformersBase(QEFFBaseModel): _hf_auto_class: type def __init__(self, model: nn.Module) -> None: - if hasattr(model.config, "quantization_config") and not isinstance( model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values()) ): @@ -443,7 +442,7 @@ def generate( ) -> str: if runtime_ai100: if not isinstance(self.qpc_path, Path): - raise TypeError("Please run compile API first!") + raise TypeError("Please run compile API first!") return QEfficient.cloud_ai_100_exec_embed( tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id @@ -451,7 +450,6 @@ def generate( else: inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) return self.model(**inputs) - @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 6a15befcf..6b8d00689 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ 
b/QEfficient/transformers/models/pytorch_transforms.py @@ -305,9 +305,6 @@ class KVCacheTransform(ModuleMappingTransform): @classmethod def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: - import ipdb - - ipdb.set_trace() model, transformed = super().apply(model) # FIXME: see if we can merge into _module_mapping dict transformers.cache_utils.DynamicCache.update = QEffDynamicCache.update diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index de0aec0ca..b765ae609 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -10,7 +10,7 @@ import numpy as np import onnxruntime as ort import pytest -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM @@ -192,17 +192,15 @@ def check_embed_pytorch_vs_ort_vs_ai100( # Try to initialize with add_pooling_layer parameter try: - model = AutoModel.from_pretrained(model_name, add_pooling_layer=False) qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path, add_pooling_layer=False) except TypeError: # If it fails, initialize without the parameter - model = AutoModel.from_pretrained(model_name) qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path) text = "My name is" tokenizer = AutoTokenizer.from_pretrained(model_name) inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=seq_len) - pt_outputs=qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) + pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) onnx_model = qeff_model.export() ort_session = ort.InferenceSession(str(onnx_model)) @@ -303,6 +301,7 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) + @pytest.mark.on_qaic def test_embed_model_pytorch_vs_onnx_vs_ai100(): model_name = "BAAI/bge-small-en-v1.5" From 12841558ecbea676928da9b0a72fd85d7be0ebb8 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 11:34:03 +0530 Subject: [PATCH 04/17] Added batch_size Signed-off-by: amitraj --- .../transformers/models/modeling_auto.py | 31 +++++++++++++++---- QEfficient/transformers/quantizers/auto.py | 1 + 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index a803906df..02b6e55e2 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -272,7 +272,7 @@ def compile( :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. :full_batch_size (int, optional): Continuous batching batch size. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``. + :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. 
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. @@ -381,6 +381,23 @@ def generate( class QEffAutoModel(QEFFTransformersBase): + """ + The QEffAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. + Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + + ``Mandatory`` Args: + :model (nn.Module): PyTorch model + + .. code-block:: python + + from QEfficient import QEffAutoModel + + model = QEffAutoModel.from_pretrained(model_name, num_hidden_layers=2) + model.compile(prefill_seq_len=32, ctx_len=1024) + + model.generate(prompts=["Hello, world!"]) + """ + _hf_auto_class = AutoModel _pytorch_transforms = [CustomOpsTransform] _onnx_transforms = [FP16ClipTransform] @@ -391,14 +408,15 @@ def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs self.num_layers = model.config.num_hidden_layers def export(self, export_dir: Optional[str] = None) -> str: + bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN example_inputs = { - "input_ids": torch.zeros((1, seq_len), dtype=torch.int64), - "attention_mask": torch.ones((1, seq_len), dtype=torch.int64), + "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), + "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), } - dynamic_axes = {"input_ids": {1: "seq_len"}, "attention_mask": {1: "seq_len"}} + dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}} output_names = ["output"] @@ -415,11 +433,12 @@ def compile( compile_dir: Optional[str] = None, *, seq_len: int = 32, - num_cores: int = 14, # FIXME: Make this mandatory arg + batch_size: int = 1, + num_cores: int = 16, # FIXME: Make this mandatory arg **compiler_options, ) -> str: specializations = [ - {"seq_len": seq_len}, + {"batch_size": batch_size, "seq_len": seq_len}, ] return self._compile( diff --git a/QEfficient/transformers/quantizers/auto.py b/QEfficient/transformers/quantizers/auto.py index b5b4be099..f4cec3b54 100644 --- a/QEfficient/transformers/quantizers/auto.py +++ b/QEfficient/transformers/quantizers/auto.py @@ -38,6 +38,7 @@ def wrapper(*args, **kwargs): # Put back quantization config and quantizer for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): + AUTO_QUANTIZATION_CONFIG_MAPPING[k] = transformers_replaced_quantization_config_mapping[k] AUTO_QUANTIZER_MAPPING[k] = transformers_replaced_quantizer_mapping[k] return out From 3f95df74144a981d641f9f66427df0a82b636604 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 12:11:27 +0530 Subject: [PATCH 05/17] Docstring added Signed-off-by: amitraj --- .../transformers/models/modeling_auto.py | 79 +++++++++++++++++-- 1 file changed, 71 insertions(+), 8 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 02b6e55e2..9cdfd47fc 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -186,7 +186,6 @@ def model_hash(self) -> str: def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. - We currently don't support exporting non-transformed models. 
Please refer to the ``convert_to_cloud_bertstyle`` function in the **Low-Level API** for a legacy function that supports this." ``Optional`` Args: :export_dir (str, optional): The directory path to store ONNX-graph. @@ -393,7 +392,7 @@ class QEffAutoModel(QEFFTransformersBase): from QEfficient import QEffAutoModel model = QEffAutoModel.from_pretrained(model_name, num_hidden_layers=2) - model.compile(prefill_seq_len=32, ctx_len=1024) + model.compile() model.generate(prompts=["Hello, world!"]) """ @@ -402,12 +401,49 @@ class QEffAutoModel(QEFFTransformersBase): _pytorch_transforms = [CustomOpsTransform] _onnx_transforms = [FP16ClipTransform] - def __init__(self, model: nn.Module, continuous_batching: bool = False, **kwargs): + def __init__(self, model: nn.Module, **kwargs): super().__init__(model) self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + """ + This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel. + Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. + + Args: + :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. + :args, kwargs: Additional arguments to pass to transformers.AutoModel. + + .. code-block:: python + + from QEfficient import QEFFAutoModel + + # Initialize the model using from_pretrained similar to transformers.AutoModel. + model = QEFFAutoModel.from_pretrained("BAAI/bge-small-en-v1.5") + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + + # You can now execute the model + model.generate(prompts=["Hi there!!"]) + """ + + self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + + return self def export(self, export_dir: Optional[str] = None) -> str: + """ + Exports the model to ``ONNX`` format using ``torch.onnx.export``. + + ``Optional`` Args: + does not any arguments. + + Returns: + :str: Path of the generated ``ONNX`` graph. + """ bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN @@ -437,6 +473,21 @@ def compile( num_cores: int = 16, # FIXME: Make this mandatory arg **compiler_options, ) -> str: + """ + This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. + If the model has not been exported yet, this method will handle the export process. + You can pass any other arguments that the `qaic-exec` takes as extra kwargs. + + ``Optional`` Args: + :onnx_path (str, optional): Path to pre-exported onnx model. + :compile_dir (str, optional): Path for saving the qpc generated. + :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :num_cores (int): Number of cores used to compile the model. + Returns: + :str: Path of the compiled ``qpc`` package. + """ + specializations = [ {"batch_size": batch_size, "seq_len": seq_len}, ] @@ -459,6 +510,22 @@ def generate( runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, ) -> str: + """ + This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. 
+ This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :prompts (List[str]): List of prompts to run the execution. + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + ``optional`` Args: + :runtime_ai100 (bool), optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + + Returns: + :str: Output from the ``AI_100`` or ``PyTorch`` runtime. + """ + + # AI_100 runtime if runtime_ai100: if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") @@ -466,12 +533,8 @@ def generate( return QEfficient.cloud_ai_100_exec_embed( tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id ) + # PyTorch runtime else: inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) return self.model(**inputs) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): - self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - - return self From 74ffc16f36cf2d5ac0ed0a8d23906302cda63e80 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 13:39:40 +0530 Subject: [PATCH 06/17] Fix-1 Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index eaa62a926..9b65b80ee 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -174,7 +174,8 @@ def get_compilation_dims(qpc_path: str) -> Tuple[int, int, Optional[int]]: raise FileNotFoundError(f"expected specializations.json file at path, {qpc_base_path}") compilation_batch_size = int(data["specializations"][0]["batch_size"]) - compilation_ctx_len = int(data["specializations"][0]["ctx_len"]) + if compilation_ctx_len := data["specializations"][0].get("ctx_len", None): + compilation_ctx_len = int(data["specializations"][0]["ctx_len"]) if compilation_fbs := data["specializations"][0].get("full_batch_size", None): compilation_fbs = int(compilation_fbs) return compilation_batch_size, compilation_ctx_len, compilation_fbs @@ -349,25 +350,25 @@ def cloud_ai_100_exec_kv( def cloud_ai_100_exec_embed( tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], - prompt: List[str], qpc_path: str, - device_id: List[int] = [0], + prompt: List[str], + device_id: List[int] = [0], ): session = QAICInferenceSession(qpc_path, device_ids=device_id) + batch_size = session.bindings[0].dims[0] seq_len = session.bindings[0].dims[1] inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - prefill_inputs = dict( + inputs = dict( input_ids=inputs["input_ids"].numpy(), attention_mask=inputs["attention_mask"].numpy(), ) - prefill_logits = { - "output": np.random.randn(1, seq_len, session.bindings[2].dims[2]).astype(np.float32), + output = { + "output": np.random.randn(batch_size, seq_len, session.bindings[2].dims[2]).astype(np.float32), } - session.set_buffers(prefill_logits) - prefill_outputs = session.run(prefill_inputs) - return prefill_outputs - + session.set_buffers(output) + outputs = 
session.run(inputs) + return outputs class QEffTextGenerationBase: def __init__( From 2fb41ad108da8cad886617b4a8f56117d942b651 Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 9 Dec 2024 14:38:40 +0530 Subject: [PATCH 07/17] Comments Addressed-1 Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 25 +++++- .../transformers/models/modeling_auto.py | 88 ++++++++++--------- QEfficient/utils/constants.py | 2 +- .../models/test_causal_lm_models.py | 45 ++++++++-- 4 files changed, 105 insertions(+), 55 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 9b65b80ee..5fa55ed9a 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -174,8 +174,7 @@ def get_compilation_dims(qpc_path: str) -> Tuple[int, int, Optional[int]]: raise FileNotFoundError(f"expected specializations.json file at path, {qpc_base_path}") compilation_batch_size = int(data["specializations"][0]["batch_size"]) - if compilation_ctx_len := data["specializations"][0].get("ctx_len", None): - compilation_ctx_len = int(data["specializations"][0]["ctx_len"]) + compilation_ctx_len = int(data["specializations"][0]["ctx_len"]) if compilation_fbs := data["specializations"][0].get("full_batch_size", None): compilation_fbs = int(compilation_fbs) return compilation_batch_size, compilation_ctx_len, compilation_fbs @@ -352,8 +351,24 @@ def cloud_ai_100_exec_embed( tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], qpc_path: str, prompt: List[str], - device_id: List[int] = [0], -): + device_id: List[int] = [0], +) -> dict: + """ + This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. + :qpc_path (str): Path to the saved generated binary file after compilation. + :prompt (str): Sample prompt for the model text generation. + ``Optional`` Args: + :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. + + Returns: + :dict: Output from the ``AI_100`` runtime. + """ + session = QAICInferenceSession(qpc_path, device_ids=device_id) batch_size = session.bindings[0].dims[0] seq_len = session.bindings[0].dims[1] @@ -368,8 +383,10 @@ def cloud_ai_100_exec_embed( } session.set_buffers(output) outputs = session.run(inputs) + session.deactivate() return outputs + class QEffTextGenerationBase: def __init__( self, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 9cdfd47fc..05b62dcb6 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -66,22 +66,6 @@ def model_name(self) -> str: mname = mname[4:] return mname - @property - def model_hash(self) -> str: - # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path. - # Using same card name will result in same hash. But, using a relative path for one run and - # absolute path for another run will result in different hash. 
- # The added complexity to resolve different paths to same location is not worth pursuing. - # Instead, advise the user to always provide same relative paths or absolute paths for local models. - - # Compute the hash with: model_config, transforms - mhash = hashlib.sha256() - mhash.update(to_hashable(self.model.config.to_diff_dict())) - mhash.update(to_hashable(self._transform_names())) - mhash.update(to_hashable({"is_tlm": self.is_tlm})) - mhash = mhash.hexdigest()[:16] - return mhash - class QEFFAutoModelForCausalLM(QEFFTransformersBase): """ @@ -349,8 +333,9 @@ def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = None, - runtime: str = "AI_100", + device_id: List[int] = [0], + runtime_ai100: bool = True, + seq_len: int = constants.Constants.CTX_LEN, **kwargs, ): """ @@ -362,21 +347,24 @@ def generate( :prompts (List[str]): List of prompts to run the execution. :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model ``optional`` Args: - :runtime (str, optional): Only ``AI_100`` runtime is supported as of now; ``ONNXRT`` and ``PyTorch`` coming soon. Defaults to "AI_100". + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + """ - if runtime != "AI_100": - raise ValueError("Only AI_100 runtime is supported right now via generate API") - if not isinstance(self.qpc_path, Path): - raise TypeError("Please run compile API first!") - generation_len = kwargs.pop("generation_len", None) - return QEfficient.cloud_ai_100_exec_kv( - tokenizer, - self.qpc_path, - prompt=prompts, - device_id=device_id, - generation_len=generation_len, - is_tlm=self.is_tlm, - ) + if runtime_ai100: + if not isinstance(self.qpc_path, Path): + raise TypeError("Please run compile API first!") + generation_len = kwargs.pop("generation_len", None) + return QEfficient.cloud_ai_100_exec_kv( + tokenizer, + self.qpc_path, + prompt=prompts, + device_id=device_id, + generation_len=generation_len, + is_tlm=self.is_tlm, + ) + else: + inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len) + return self.model(**inputs) class QEffAutoModel(QEFFTransformersBase): @@ -405,7 +393,7 @@ def __init__(self, model: nn.Module, **kwargs): super().__init__(model) self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers - + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """ @@ -429,11 +417,26 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # You can now execute the model model.generate(prompts=["Hi there!!"]) """ - + self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) return self + @property + def model_hash(self) -> str: + # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path. + # Using same card name will result in same hash. But, using a relative path for one run and + # absolute path for another run will result in different hash. + # The added complexity to resolve different paths to same location is not worth pursuing. + # Instead, advise the user to always provide same relative paths or absolute paths for local models. 
+ + # Compute the hash with: model_config, transforms + mhash = hashlib.sha256() + mhash.update(to_hashable(self.model.config.to_diff_dict())) + mhash.update(to_hashable(self._transform_names())) + mhash = mhash.hexdigest()[:16] + return mhash + def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. @@ -470,7 +473,9 @@ def compile( *, seq_len: int = 32, batch_size: int = 1, + num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg + mxfp6_matmul: bool = False, **compiler_options, ) -> str: """ @@ -498,6 +503,8 @@ def compile( compile_only=True, specializations=specializations, convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, **compiler_options, ) @@ -505,11 +512,11 @@ def compile( def generate( self, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], - prompt: List[str], + prompts: List[str], device_id: List[int] = [0], runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, - ) -> str: + ) -> dict: """ This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. @@ -519,10 +526,10 @@ def generate( :prompts (List[str]): List of prompts to run the execution. :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model ``optional`` Args: - :runtime_ai100 (bool), optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. Returns: - :str: Output from the ``AI_100`` or ``PyTorch`` runtime. + :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. 
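A minimal usage sketch of the two runtime paths described above, assuming an example embedding model card and that ``compile`` has already produced a QPC; the prompts and core count are illustrative placeholders, not values mandated by this patch.

.. code-block:: python

    from transformers import AutoTokenizer

    from QEfficient import QEffAutoModel

    # "BAAI/bge-small-en-v1.5" is used here only as an example embedding model card.
    card = "BAAI/bge-small-en-v1.5"
    tokenizer = AutoTokenizer.from_pretrained(card)
    model = QEffAutoModel.from_pretrained(card)

    # Compile once to produce the QPC consumed by the AI_100 path.
    model.compile(num_cores=14)

    # AI_100 runtime (default): executes the compiled QPC on Cloud AI 100 hardware.
    ai100_outputs = model.generate(tokenizer=tokenizer, prompts=["My name is"], runtime_ai100=True)

    # PyTorch runtime: tokenizes the prompts and runs a plain forward pass on the torch module.
    pt_outputs = model.generate(tokenizer=tokenizer, prompts=["My name is"], runtime_ai100=False)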
""" # AI_100 runtime @@ -531,10 +538,9 @@ def generate( raise TypeError("Please run compile API first!") return QEfficient.cloud_ai_100_exec_embed( - tokenizer=tokenizer, prompt=prompt, qpc_path=self.qpc_path, device_id=device_id + tokenizer=tokenizer, prompt=prompts, qpc_path=self.qpc_path, device_id=device_id ) # PyTorch runtime else: - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len) return self.model(**inputs) - diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 4a3ba3ff3..5e3a29072 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -47,7 +47,7 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32 ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep -ONNX_EXPORT_OPSET = 13 +ONNX_EXPORT_OPSET = 14 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"] diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index b765ae609..be17732a7 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -15,7 +15,7 @@ from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers -from QEfficient.utils import hf_download +from QEfficient.utils import hf_download, padding_check_and_fix from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants from QEfficient.utils.device_utils import get_available_device_id @@ -192,13 +192,26 @@ def check_embed_pytorch_vs_ort_vs_ai100( # Try to initialize with add_pooling_layer parameter try: - qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path, add_pooling_layer=False) + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + add_pooling_layer=False, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) except TypeError: # If it fails, initialize without the parameter - qeff_model = QEffAutoModel.from_pretrained(pretrained_model_name_or_path=model_path) - text = "My name is" + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) + + prompt = "My name is" tokenizer = AutoTokenizer.from_pretrained(model_name) - inputs = tokenizer(text, return_tensors="pt", padding="max_length", max_length=seq_len) + padding_check_and_fix(tokenizer) + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) @@ -214,7 +227,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( onnx_embeddings = onnx_outputs[0] mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) print("Mad for onnx and pytorch is ", mad) - assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" + assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" qeff_model.compile( num_cores=14, @@ -224,7 +237,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( # Compare ONNX and AI 100 outputs mad = np.mean(np.abs(ai100_output["output"] - 
onnx_outputs[0])) print("Mad for onnx and AI 100 output is ", mad) - assert mad <= 10**-2, f"MAD is too high for onnx and Pytorch: {mad}" + assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" # FIXME: there should be a CB test here @@ -302,7 +315,21 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) +embed_test_models = [ + # model_name, architecture + "nomic-ai/nomic-embed-text-v1.5", # NomicBertModel + "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM + "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification + "BAAI/bge-small-en-v1.5", # BertModel + # "intfloat/e5-mistral-7b-instruct", # MistralModel + # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM +] + + @pytest.mark.on_qaic -def test_embed_model_pytorch_vs_onnx_vs_ai100(): - model_name = "BAAI/bge-small-en-v1.5" +@pytest.mark.parametrize("model_name", embed_test_models) +def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): + """ + Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. + """ check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) From 262f45ebb4c21e793aad326b03017bd77b63b23e Mon Sep 17 00:00:00 2001 From: amitraj Date: Sun, 15 Dec 2024 11:36:11 +0530 Subject: [PATCH 08/17] Comments addressed-2 Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 102 +++++++++++++++--- .../transformers/models/modeling_auto.py | 21 ++-- .../models/test_causal_lm_models.py | 82 +------------- .../models/test_embedding_models.py | 95 ++++++++++++++++ 4 files changed, 196 insertions(+), 104 deletions(-) create mode 100644 tests/transformers/models/test_embedding_models.py diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 5fa55ed9a..40328f725 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -350,8 +350,9 @@ def cloud_ai_100_exec_kv( def cloud_ai_100_exec_embed( tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], qpc_path: str, - prompt: List[str], + prompts: List[str], device_id: List[int] = [0], + enable_debug_logs: bool = False, ) -> dict: """ This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. @@ -368,23 +369,14 @@ def cloud_ai_100_exec_embed( Returns: :dict: Output from the ``AI_100`` runtime. 
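A minimal sketch of calling this helper directly, assuming a QPC has already been generated via ``QEffAutoModel.compile``; the tokenizer card and QPC path below are placeholders.

.. code-block:: python

    from transformers import AutoTokenizer

    from QEfficient import cloud_ai_100_exec_embed

    # Example card taken from the embedding tests in this series.
    tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")

    # "qpc_path" must point to an existing compiled QPC package; the value here is a placeholder.
    outputs = cloud_ai_100_exec_embed(
        tokenizer=tokenizer,
        qpc_path="qeff_models/bge-small-en-v1.5/qpc",
        prompts=["My name is"],
    )

    # One entry is returned per prompt; each holds the raw "output" buffer from the session run.
    print(outputs[0]["output"].shape)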
""" - - session = QAICInferenceSession(qpc_path, device_ids=device_id) - batch_size = session.bindings[0].dims[0] - seq_len = session.bindings[0].dims[1] - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - - inputs = dict( - input_ids=inputs["input_ids"].numpy(), - attention_mask=inputs["attention_mask"].numpy(), + generate_feature=FeatureGeneration( + tokenizer=tokenizer, + qpc_path=qpc_path, + device_id=device_id, + enable_debug_logs=enable_debug_logs, ) - output = { - "output": np.random.randn(batch_size, seq_len, session.bindings[2].dims[2]).astype(np.float32), - } - session.set_buffers(output) - outputs = session.run(inputs) - session.deactivate() - return outputs + + return generate_feature.generate(prompts=prompts) class QEffTextGenerationBase: @@ -406,6 +398,7 @@ def __init__( # Load QPC self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) + # Fetch the variables from the QPC self._vocab_size = self._fetch_vocab_size() # Fetch Vocab size self.batch_size, self._prefill_seq_len = self._fetch_batch_size_prefill_seq_len() @@ -1110,3 +1103,78 @@ def generate( perf_metrics=perf_metrics, ) return latency_stats + +class QEffFeatureGenerationBase: + + def __init__( + self, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + ctx_len: Optional[int] = None, + device_id: Optional[List[int]] = None, + enable_debug_logs: bool = False, + ) -> None: + self.ctx_len = ctx_len + + # Load QPC + self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) + + self._batch_size = self._session.bindings[0].dims[0] + self._seq_len = self._session.bindings[0].dims[1] + + self.tokenizer = tokenizer + self._set_tokenizer_params() # set tokenizer params + + def _set_tokenizer_params(self): + """ + Sets the tokenizer parameters for the model. 
+ """ + if self.tokenizer.padding_side != "right": + logger.warning("Please use padding_side='right' while initializing the tokenizer") + self.tokenizer.padding_side = "right" + if self.tokenizer.pad_token_id is None: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + + +class FeatureGeneration: + def __init__( + self, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + qpc_path: str, + seq_len: Optional[int] = None, + device_id: Optional[List[int]] = None, + enable_debug_logs: bool = False, + ) -> None: + + self._qaic_model = QEffFeatureGenerationBase( + tokenizer, qpc_path, seq_len, device_id, enable_debug_logs + ) + self._batch_size = self._qaic_model._batch_size + self._tokenizer = self._qaic_model.tokenizer + self._seq_len = self._qaic_model._seq_len + self._session = self._qaic_model._session + def generate( + self, + prompts: List[str] + ): + outputs = [] + + for prompt in prompts: + inputs = self._tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self._seq_len) + + inputs = dict( + input_ids=inputs["input_ids"].numpy(), + attention_mask=inputs["attention_mask"].numpy(), + ) + output = { + "output": np.random.randn(self._batch_size, self._seq_len, self._session.bindings[2].dims[2]).astype( + np.float32 + ), + } + self._session.set_buffers(output) + output = self._session.run(inputs) + outputs.append(output) + return outputs + + + diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 05b62dcb6..027ca54eb 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -335,7 +335,6 @@ def generate( prompts: List[str], device_id: List[int] = [0], runtime_ai100: bool = True, - seq_len: int = constants.Constants.CTX_LEN, **kwargs, ): """ @@ -363,8 +362,7 @@ def generate( is_tlm=self.is_tlm, ) else: - inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len) - return self.model(**inputs) + raise ValueError("Only AI_100 runtime is supported right now via generate API") class QEffAutoModel(QEFFTransformersBase): @@ -395,6 +393,7 @@ def __init__(self, model: nn.Module, **kwargs): self.num_layers = model.config.num_hidden_layers @classmethod + @with_replaced_quantizers def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """ This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel. 
@@ -417,10 +416,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # You can now execute the model model.generate(prompts=["Hi there!!"]) """ + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') - self = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") - return self + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False}) + + try: + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + except TypeError: + kwargs.pop("add_pooling_layers", None) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model) @property def model_hash(self) -> str: @@ -538,7 +547,7 @@ def generate( raise TypeError("Please run compile API first!") return QEfficient.cloud_ai_100_exec_embed( - tokenizer=tokenizer, prompt=prompts, qpc_path=self.qpc_path, device_id=device_id + tokenizer=tokenizer, prompts=prompts, qpc_path=self.qpc_path, device_id=device_id ) # PyTorch runtime else: diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index be17732a7..629828d55 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -179,67 +179,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ] ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." - -def check_embed_pytorch_vs_ort_vs_ai100( - model_name: str, - seq_len: int = Constants.CTX_LEN, - n_layer: int = 1, -): - model_path = hf_download( - repo_id=model_name, - ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], - ) - - # Try to initialize with add_pooling_layer parameter - try: - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - add_pooling_layer=False, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) - except TypeError: - # If it fails, initialize without the parameter - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) - - prompt = "My name is" - tokenizer = AutoTokenizer.from_pretrained(model_name) - padding_check_and_fix(tokenizer) - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - - pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompt="My name is", runtime_ai100=False) - - onnx_model = qeff_model.export() - ort_session = ort.InferenceSession(str(onnx_model)) - # Prepare the inputs for ONNX Runtime - onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} - # Run inference - onnx_outputs = ort_session.run(None, onnx_inputs) - - # Compare PyTorch and ONNX outputs - pt_embeddings = pt_outputs[0].detach().numpy() - onnx_embeddings = onnx_outputs[0] - mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) - print("Mad for onnx and pytorch is ", mad) - assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" - - qeff_model.compile( - num_cores=14, - ) - ai100_output = qeff_model.generate(tokenizer=tokenizer, prompt=["My name is"]) - - # Compare ONNX and AI 100 outputs - mad = 
np.mean(np.abs(ai100_output["output"] - onnx_outputs[0])) - print("Mad for onnx and AI 100 output is ", mad) - assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" - - # FIXME: there should be a CB test here @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x) def test_causal_lm_export_with_deprecated_api(model_name): @@ -313,23 +252,4 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): prompt_len = 1 check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) - - -embed_test_models = [ - # model_name, architecture - "nomic-ai/nomic-embed-text-v1.5", # NomicBertModel - "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM - "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification - "BAAI/bge-small-en-v1.5", # BertModel - # "intfloat/e5-mistral-7b-instruct", # MistralModel - # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM -] - - -@pytest.mark.on_qaic -@pytest.mark.parametrize("model_name", embed_test_models) -def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): - """ - Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. - """ - check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) + \ No newline at end of file diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py new file mode 100644 index 000000000..fe3ca7d62 --- /dev/null +++ b/tests/transformers/models/test_embedding_models.py @@ -0,0 +1,95 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +import numpy as np +import onnxruntime as ort +import pytest +from transformers import AutoTokenizer + +from QEfficient.transformers.models.modeling_auto import QEffAutoModel +from QEfficient.utils import hf_download, padding_check_and_fix +from QEfficient.utils.constants import Constants + +embed_test_models = [ + # model_name, architecture + "nomic-ai/nomic-embed-text-v1.5", # NomicBertModel + "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM + "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification + "BAAI/bge-small-en-v1.5", # BertModel + # "intfloat/e5-mistral-7b-instruct", # MistralModel + # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM +] + +def check_embed_pytorch_vs_ort_vs_ai100( + model_name: str, + seq_len: int = Constants.CTX_LEN, + n_layer: int = 1, +): + model_path = hf_download( + repo_id=model_name, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + + # Try to initialize with add_pooling_layer parameter + try: + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + add_pooling_layer=False, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) + except TypeError: + # If it fails, initialize without the parameter + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) + + prompt = "My name is" + tokenizer = AutoTokenizer.from_pretrained(model_name) + padding_check_and_fix(tokenizer) + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + + pt_outputs = 
qeff_model.generate(tokenizer=tokenizer, prompts=["My name is"], runtime_ai100=False) + + onnx_model = qeff_model.export() + ort_session = ort.InferenceSession(str(onnx_model)) + # Prepare the inputs for ONNX Runtime + onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} + # Run inference + onnx_outputs = ort_session.run(None, onnx_inputs) + + # Compare PyTorch and ONNX outputs + pt_embeddings = pt_outputs[0][0].detach().numpy() + onnx_embeddings = onnx_outputs[0] + mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) + print("Mad for onnx and pytorch is ", mad) + assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" + + qeff_model.compile( + num_cores=14, + ) + ai100_output = qeff_model.generate(tokenizer=tokenizer, prompts=["My name is"]) + + # Compare ONNX and AI 100 outputs + mad = np.mean(np.abs(ai100_output[0]["output"] - onnx_outputs[0])) + print("Mad for onnx and AI 100 output is ", mad) + assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" + + + +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", embed_test_models) +def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): + """ + Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. + """ + check_embed_pytorch_vs_ort_vs_ai100(model_name=model_name, seq_len=32, n_layer=1) From ba0258b69999ec8e8dc3c0d20ee709f6c066a0da Mon Sep 17 00:00:00 2001 From: amitraj Date: Sun, 15 Dec 2024 11:37:35 +0530 Subject: [PATCH 09/17] Lint and formatted Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 37 +++++++------------ .../models/test_causal_lm_models.py | 9 ++--- .../models/test_embedding_models.py | 2 +- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 40328f725..6d6377ed2 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -369,13 +369,13 @@ def cloud_ai_100_exec_embed( Returns: :dict: Output from the ``AI_100`` runtime. """ - generate_feature=FeatureGeneration( + generate_feature = FeatureGeneration( tokenizer=tokenizer, qpc_path=qpc_path, device_id=device_id, enable_debug_logs=enable_debug_logs, ) - + return generate_feature.generate(prompts=prompts) @@ -398,7 +398,6 @@ def __init__( # Load QPC self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) - # Fetch the variables from the QPC self._vocab_size = self._fetch_vocab_size() # Fetch Vocab size self.batch_size, self._prefill_seq_len = self._fetch_batch_size_prefill_seq_len() @@ -1103,9 +1102,9 @@ def generate( perf_metrics=perf_metrics, ) return latency_stats - + + class QEffFeatureGenerationBase: - def __init__( self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], @@ -1115,16 +1114,16 @@ def __init__( enable_debug_logs: bool = False, ) -> None: self.ctx_len = ctx_len - + # Load QPC self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) - + self._batch_size = self._session.bindings[0].dims[0] self._seq_len = self._session.bindings[0].dims[1] - + self.tokenizer = tokenizer self._set_tokenizer_params() # set tokenizer params - + def _set_tokenizer_params(self): """ Sets the tokenizer parameters for the model. 
@@ -1134,8 +1133,8 @@ def _set_tokenizer_params(self): self.tokenizer.padding_side = "right" if self.tokenizer.pad_token_id is None: self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - - + + class FeatureGeneration: def __init__( self, @@ -1145,18 +1144,13 @@ def __init__( device_id: Optional[List[int]] = None, enable_debug_logs: bool = False, ) -> None: - - self._qaic_model = QEffFeatureGenerationBase( - tokenizer, qpc_path, seq_len, device_id, enable_debug_logs - ) + self._qaic_model = QEffFeatureGenerationBase(tokenizer, qpc_path, seq_len, device_id, enable_debug_logs) self._batch_size = self._qaic_model._batch_size self._tokenizer = self._qaic_model.tokenizer self._seq_len = self._qaic_model._seq_len - self._session = self._qaic_model._session - def generate( - self, - prompts: List[str] - ): + self._session = self._qaic_model._session + + def generate(self, prompts: List[str]): outputs = [] for prompt in prompts: @@ -1175,6 +1169,3 @@ def generate( output = self._session.run(inputs) outputs.append(output) return outputs - - - diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 629828d55..6e91711e0 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -8,14 +8,13 @@ from typing import Optional import numpy as np -import onnxruntime as ort import pytest -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers -from QEfficient.utils import hf_download, padding_check_and_fix +from QEfficient.utils import hf_download from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants from QEfficient.utils.device_utils import get_available_device_id @@ -179,6 +178,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ] ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." 
+ # FIXME: there should be a CB test here @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x) def test_causal_lm_export_with_deprecated_api(model_name): @@ -252,4 +252,3 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): prompt_len = 1 check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len) - \ No newline at end of file diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index fe3ca7d62..4feb622a9 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -25,6 +25,7 @@ # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM ] + def check_embed_pytorch_vs_ort_vs_ai100( model_name: str, seq_len: int = Constants.CTX_LEN, @@ -85,7 +86,6 @@ def check_embed_pytorch_vs_ort_vs_ai100( assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" - @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", embed_test_models) def test_embed_model_pytorch_vs_onnx_vs_ai100(model_name): From ba66c759576de397899c78d0e1b0a46ecf51255e Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 16 Dec 2024 10:23:27 +0530 Subject: [PATCH 10/17] Comments addressed-3 Signed-off-by: amitraj --- .../transformers/models/modeling_auto.py | 3 +-- pyproject.toml | 1 + .../models/test_embedding_models.py | 25 +++++-------------- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 027ca54eb..fd8d0acf7 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -423,11 +423,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False}) - try: model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) except TypeError: - kwargs.pop("add_pooling_layers", None) + kwargs.pop("add_pooling_layer", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) return cls(model) diff --git a/pyproject.toml b/pyproject.toml index fbffbd317..a4d3c0be9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "numpy==1.23.0", "protobuf==3.20.2", "onnxscript==0.1.0.dev20240327", + "einops==0.8.0", "sympy", "torch==2.4.1; platform_machine=='aarch64'", # Specifying torch cpu package URL per python version, update the list once pytorch releases whl for python>3.11 diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 4feb622a9..ab3ba7ba4 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -21,8 +21,6 @@ "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification "BAAI/bge-small-en-v1.5", # BertModel - # "intfloat/e5-mistral-7b-instruct", # MistralModel - # "dunzhang/stella_en_1.5B_v5", # Qwen2ForCausalLM ] @@ -36,23 +34,12 @@ def check_embed_pytorch_vs_ort_vs_ai100( ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], ) - # Try to initialize with add_pooling_layer parameter - try: - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - 
add_pooling_layer=False, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) - except TypeError: - # If it fails, initialize without the parameter - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) + qeff_model = QEffAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) prompt = "My name is" tokenizer = AutoTokenizer.from_pretrained(model_name) From 38a418647703f3cd60c911bfbf2d6ff0db599e6d Mon Sep 17 00:00:00 2001 From: amitraj Date: Mon, 16 Dec 2024 11:32:05 +0530 Subject: [PATCH 11/17] Fix-2 Signed-off-by: amitraj --- .../generation/text_generation_inference.py | 32 ++++++++++++++++++- .../transformers/models/modeling_auto.py | 8 +++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 6d6377ed2..e6f3ba3b0 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -17,7 +17,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.utils import padding_check_and_fix +from QEfficient.utils import constants, padding_check_and_fix from QEfficient.utils.logging_utils import logger @@ -379,6 +379,36 @@ def cloud_ai_100_exec_embed( return generate_feature.generate(prompts=prompts) +def pytorch_feature_generate( + model, + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + prompts: List[str], + seq_len: int = constants.Constants.CTX_LEN, +): + """ + Generates features from a list of text prompts using a PyTorch model and tokenizer. + + ``Mandatory`` Args: + model: The PyTorch model used for generating features. + tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): The tokenizer used to preprocess the prompts. + prompts (List[str]): A list of text prompts to be tokenized and processed. + ``Optional`` Args: + seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN. + + Returns: + List[torch.Tensor]: A list of output features generated by the model for each prompt. 
+ """ + + outputs = [] + for prompt in prompts: + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + import ipdb + + ipdb.set_trace() + outputs.append(model(**inputs)) + return outputs + + class QEffTextGenerationBase: def __init__( self, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index fd8d0acf7..efb73fc82 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -18,6 +18,7 @@ import QEfficient from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform +from QEfficient.generation.text_generation_inference import pytorch_feature_generate from QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, KVCacheTransform, SpDTransform from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform @@ -99,6 +100,10 @@ def __init__( is_tlm: bool = False, **kwargs, ): + model_class_name = model.__class__.__name__ + if not (model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")): + raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}") + # TODO: remove from version 1.20 if kwargs.pop("full_batch_size", None): continuous_batching = True @@ -550,5 +555,4 @@ def generate( ) # PyTorch runtime else: - inputs = tokenizer(prompts, return_tensors="pt", padding="max_length", max_length=seq_len) - return self.model(**inputs) + return pytorch_feature_generate(model=self.model, tokenizer=tokenizer, prompts=prompts, seq_len=seq_len) From 4401fd63082f34ce3cb3957cb7152763eb5c482b Mon Sep 17 00:00:00 2001 From: amitraj Date: Tue, 17 Dec 2024 13:34:00 +0530 Subject: [PATCH 12/17] Comments addressed-4 Signed-off-by: amitraj --- QEfficient/__init__.py | 3 +- .../generation/text_generation_inference.py | 131 +----------------- .../transformers/models/modeling_auto.py | 84 +++++++++-- pyproject.toml | 1 - .../models/test_embedding_models.py | 15 +- 5 files changed, 83 insertions(+), 151 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 987399316..0f7f40483 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -8,7 +8,7 @@ from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_embed, cloud_ai_100_exec_kv +from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv from QEfficient.peft import QEffAutoPeftModelForCausalLM from QEfficient.transformers.transform import transform @@ -21,7 +21,6 @@ "export", "compile", "cloud_ai_100_exec_kv", - "cloud_ai_100_exec_embed", "QEffAutoModel", "QEFFAutoModelForCausalLM", "QEffAutoPeftModelForCausalLM", diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index e6f3ba3b0..4ddd57ada 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -17,7 +17,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from 
QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.utils import constants, padding_check_and_fix +from QEfficient.utils import padding_check_and_fix from QEfficient.utils.logging_utils import logger @@ -347,68 +347,6 @@ def cloud_ai_100_exec_kv( return exec_info -def cloud_ai_100_exec_embed( - tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], - qpc_path: str, - prompts: List[str], - device_id: List[int] = [0], - enable_debug_logs: bool = False, -) -> dict: - """ - This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. - If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. - - ``Mandatory`` Args: - :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Model tokenizer. - :qpc_path (str): Path to the saved generated binary file after compilation. - :prompt (str): Sample prompt for the model text generation. - ``Optional`` Args: - :device_id (List[int]): Device IDs to be used for execution. If ``len(device_id) > 1``, it enables multiple card setup. If ``None``, auto-device-picker will be used. ``Defaults to None``. - - Returns: - :dict: Output from the ``AI_100`` runtime. - """ - generate_feature = FeatureGeneration( - tokenizer=tokenizer, - qpc_path=qpc_path, - device_id=device_id, - enable_debug_logs=enable_debug_logs, - ) - - return generate_feature.generate(prompts=prompts) - - -def pytorch_feature_generate( - model, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - prompts: List[str], - seq_len: int = constants.Constants.CTX_LEN, -): - """ - Generates features from a list of text prompts using a PyTorch model and tokenizer. - - ``Mandatory`` Args: - model: The PyTorch model used for generating features. - tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): The tokenizer used to preprocess the prompts. - prompts (List[str]): A list of text prompts to be tokenized and processed. - ``Optional`` Args: - seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN. - - Returns: - List[torch.Tensor]: A list of output features generated by the model for each prompt. - """ - - outputs = [] - for prompt in prompts: - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - import ipdb - - ipdb.set_trace() - outputs.append(model(**inputs)) - return outputs - - class QEffTextGenerationBase: def __init__( self, @@ -1132,70 +1070,3 @@ def generate( perf_metrics=perf_metrics, ) return latency_stats - - -class QEffFeatureGenerationBase: - def __init__( - self, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, - ctx_len: Optional[int] = None, - device_id: Optional[List[int]] = None, - enable_debug_logs: bool = False, - ) -> None: - self.ctx_len = ctx_len - - # Load QPC - self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) - - self._batch_size = self._session.bindings[0].dims[0] - self._seq_len = self._session.bindings[0].dims[1] - - self.tokenizer = tokenizer - self._set_tokenizer_params() # set tokenizer params - - def _set_tokenizer_params(self): - """ - Sets the tokenizer parameters for the model. 
- """ - if self.tokenizer.padding_side != "right": - logger.warning("Please use padding_side='right' while initializing the tokenizer") - self.tokenizer.padding_side = "right" - if self.tokenizer.pad_token_id is None: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - - -class FeatureGeneration: - def __init__( - self, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, - seq_len: Optional[int] = None, - device_id: Optional[List[int]] = None, - enable_debug_logs: bool = False, - ) -> None: - self._qaic_model = QEffFeatureGenerationBase(tokenizer, qpc_path, seq_len, device_id, enable_debug_logs) - self._batch_size = self._qaic_model._batch_size - self._tokenizer = self._qaic_model.tokenizer - self._seq_len = self._qaic_model._seq_len - self._session = self._qaic_model._session - - def generate(self, prompts: List[str]): - outputs = [] - - for prompt in prompts: - inputs = self._tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self._seq_len) - - inputs = dict( - input_ids=inputs["input_ids"].numpy(), - attention_mask=inputs["attention_mask"].numpy(), - ) - output = { - "output": np.random.randn(self._batch_size, self._seq_len, self._session.bindings[2].dims[2]).astype( - np.float32 - ), - } - self._session.set_buffers(output) - output = self._session.run(inputs) - outputs.append(output) - return outputs diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index efb73fc82..01237db2a 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -11,6 +11,7 @@ from pathlib import Path from typing import List, Optional, Union +import numpy as np import torch import torch.nn as nn from transformers import AutoModel, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -18,11 +19,12 @@ import QEfficient from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform -from QEfficient.generation.text_generation_inference import pytorch_feature_generate +from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, KVCacheTransform, SpDTransform from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform from QEfficient.utils import constants, get_padding_shape_from_config +from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.cache import to_hashable logger = logging.getLogger(__file__) @@ -145,7 +147,7 @@ def from_pretrained( model = QEFFAutoModelForCausalLM.from_pretrained("gpt2") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + model.compile(num_cores=6, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU # You can now execute the model model.generate(prompts=["Hi there!!"]) @@ -396,6 +398,7 @@ def __init__(self, model: nn.Module, **kwargs): super().__init__(model) self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers + self.tokenizer = load_hf_tokenizer(self.model.config.name_or_path) @classmethod @with_replaced_quantizers @@ -416,7 +419,7 @@ def from_pretrained(cls, 
pretrained_model_name_or_path, *args, **kwargs): model = QEFFAutoModel.from_pretrained("BAAI/bge-small-en-v1.5") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 Standard SKU + model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 SKU # You can now execute the model model.generate(prompts=["Hi there!!"]) @@ -524,9 +527,8 @@ def compile( def generate( self, - tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], prompts: List[str], - device_id: List[int] = [0], + device_ids: List[int] = [0], runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, ) -> dict: @@ -550,9 +552,73 @@ def generate( if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") - return QEfficient.cloud_ai_100_exec_embed( - tokenizer=tokenizer, prompts=prompts, qpc_path=self.qpc_path, device_id=device_id - ) + return self.cloud_ai_100_feature_generate(prompts=prompts, device_ids=device_ids) # PyTorch runtime else: - return pytorch_feature_generate(model=self.model, tokenizer=tokenizer, prompts=prompts, seq_len=seq_len) + return self.pytorch_feature_generate(model=self.model, prompts=prompts, seq_len=seq_len) + + def cloud_ai_100_feature_generate( + self, + prompts: List[str], + device_ids: List[int] = [0], + ): + """ + Generates features using the QAICInferenceSession for a list of prompts. + + This function initializes a QAICInferenceSession if not already initialized, + tokenizes the input prompts, and generates output features using the session. + + Args: + prompts (List[str]): A list of input prompts to generate features for. + device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. + + Returns: + List[Dict[str, np.ndarray]]: A list of dictionaries containing the generated output features. + """ + if self.qpc_session is None: + self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) + self.batch_size = self.qpc_session.bindings[0].dims[0] + self.seq_len = self.qpc_session.bindings[0].dims[1] + outputs = [] + + for prompt in prompts: + inputs = self.tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self.seq_len) + + inputs = dict( + input_ids=inputs["input_ids"].numpy(), + attention_mask=inputs["attention_mask"].numpy(), + ) + output = { + "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( + np.float32 + ), + } + self.qpc_session.set_buffers(output) + output = self.qpc_session.run(inputs) + outputs.append(output) + return outputs + + def pytorch_feature_generate( + self, + model, + prompts: List[str], + seq_len: int = constants.Constants.CTX_LEN, + ): + """ + Generates features from a list of text prompts using a PyTorch model. + + ``Mandatory`` Args: + model: The PyTorch model used for generating features. + prompts (List[str]): A list of text prompts to be tokenized and processed. + ``Optional`` Args: + seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN. + + Returns: + List[torch.Tensor]: A list of output features generated by the model for each prompt. 
+ """ + + outputs = [] + for prompt in prompts: + inputs = self.tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) + outputs.append(model(**inputs)) + return outputs diff --git a/pyproject.toml b/pyproject.toml index a4d3c0be9..fbffbd317 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,6 @@ dependencies = [ "numpy==1.23.0", "protobuf==3.20.2", "onnxscript==0.1.0.dev20240327", - "einops==0.8.0", "sympy", "torch==2.4.1; platform_machine=='aarch64'", # Specifying torch cpu package URL per python version, update the list once pytorch releases whl for python>3.11 diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index ab3ba7ba4..ed41c7349 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -9,15 +9,14 @@ import numpy as np import onnxruntime as ort import pytest -from transformers import AutoTokenizer from QEfficient.transformers.models.modeling_auto import QEffAutoModel -from QEfficient.utils import hf_download, padding_check_and_fix +from QEfficient.utils import hf_download +from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants embed_test_models = [ # model_name, architecture - "nomic-ai/nomic-embed-text-v1.5", # NomicBertModel "sentence-transformers/multi-qa-mpnet-base-cos-v1", # MPNetForMaskedLM "BAAI/bge-reranker-v2-m3", # XLMRobertaForSequenceClassification "BAAI/bge-small-en-v1.5", # BertModel @@ -42,15 +41,13 @@ def check_embed_pytorch_vs_ort_vs_ai100( ) prompt = "My name is" - tokenizer = AutoTokenizer.from_pretrained(model_name) - padding_check_and_fix(tokenizer) - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - - pt_outputs = qeff_model.generate(tokenizer=tokenizer, prompts=["My name is"], runtime_ai100=False) + pt_outputs = qeff_model.generate(prompts=["My name is"], runtime_ai100=False) onnx_model = qeff_model.export() ort_session = ort.InferenceSession(str(onnx_model)) # Prepare the inputs for ONNX Runtime + tokenizer = load_hf_tokenizer(model_path) + inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} # Run inference onnx_outputs = ort_session.run(None, onnx_inputs) @@ -65,7 +62,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( qeff_model.compile( num_cores=14, ) - ai100_output = qeff_model.generate(tokenizer=tokenizer, prompts=["My name is"]) + ai100_output = qeff_model.generate(prompts=["My name is"]) # Compare ONNX and AI 100 outputs mad = np.mean(np.abs(ai100_output[0]["output"] - onnx_outputs[0])) From 206c81a0bd60ede7f29e098b92c711e0706c7f11 Mon Sep 17 00:00:00 2001 From: amitraj Date: Tue, 17 Dec 2024 13:54:00 +0530 Subject: [PATCH 13/17] Minor fix-1 Signed-off-by: amitraj --- QEfficient/transformers/models/modeling_auto.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 01237db2a..ab87242e6 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -391,7 +391,7 @@ class QEffAutoModel(QEFFTransformersBase): """ _hf_auto_class = AutoModel - _pytorch_transforms = [CustomOpsTransform] + _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, 
GPTQToMatmulNbitsTransform] _onnx_transforms = [FP16ClipTransform] def __init__(self, model: nn.Module, **kwargs): @@ -416,10 +416,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): from QEfficient import QEFFAutoModel # Initialize the model using from_pretrained similar to transformers.AutoModel. - model = QEFFAutoModel.from_pretrained("BAAI/bge-small-en-v1.5") + model = QEFFAutoModel.from_pretrained("model_name") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=14, device_group=[0]) # Considering you have a Cloud AI 100 SKU + model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU # You can now execute the model model.generate(prompts=["Hi there!!"]) @@ -563,17 +563,15 @@ def cloud_ai_100_feature_generate( device_ids: List[int] = [0], ): """ - Generates features using the QAICInferenceSession for a list of prompts. + Generates features with list of prompts using AI 100 runtime. - This function initializes a QAICInferenceSession if not already initialized, - tokenizes the input prompts, and generates output features using the session. - - Args: + ``Mandatory`` Args: prompts (List[str]): A list of input prompts to generate features for. + ``Optional`` Args: device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. Returns: - List[Dict[str, np.ndarray]]: A list of dictionaries containing the generated output features. + List[Dict[np.ndarray]]: A list of dictionaries containing the generated output features. """ if self.qpc_session is None: self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) From 0f1f8bb4825da1189bcf16c32aa849ea03dcfaa6 Mon Sep 17 00:00:00 2001 From: amitraj Date: Wed, 18 Dec 2024 14:34:48 +0530 Subject: [PATCH 14/17] fix-major Signed-off-by: amitraj --- QEfficient/__init__.py | 4 +- QEfficient/base/__init__.py | 2 +- QEfficient/base/common.py | 2 +- .../transformers/models/modeling_auto.py | 111 ++++++++++-------- docs/source/hl_api.md | 7 +- .../models/test_embedding_models.py | 52 ++++++-- 6 files changed, 108 insertions(+), 70 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 0f7f40483..8e32a1e6e 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -5,7 +5,7 @@ # # ----------------------------------------------------------------------------- -from QEfficient.base import QEffAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader +from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv @@ -21,7 +21,7 @@ "export", "compile", "cloud_ai_100_exec_kv", - "QEffAutoModel", + "QEFFAutoModel", "QEFFAutoModelForCausalLM", "QEffAutoPeftModelForCausalLM", "QEFFCommonLoader", diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py index 257051d97..86cff11c1 100644 --- a/QEfficient/base/__init__.py +++ b/QEfficient/base/__init__.py @@ -6,4 +6,4 @@ # ----------------------------------------------------------------------------- from QEfficient.base.common import QEFFCommonLoader # noqa: F401 -from QEfficient.transformers.models.modeling_auto import QEffAutoModel, QEFFAutoModelForCausalLM # noqa: F401 +from QEfficient.transformers.models.modeling_auto import QEFFAutoModel, QEFFAutoModelForCausalLM # noqa: F401 diff --git 
a/QEfficient/base/common.py b/QEfficient/base/common.py index e42e74687..192294738 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -76,7 +76,7 @@ def __init__(self, *args: Any, **kwds: Any) -> None: @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> QEFFBaseModel: """ - Downloads HuggingFace model if already doesn't exist locally, returns QEffAutoModel object based on type of model. + Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model. """ if not os.path.isdir(pretrained_model_name_or_path): pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index ab87242e6..9233ed563 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -372,9 +372,9 @@ def generate( raise ValueError("Only AI_100 runtime is supported right now via generate API") -class QEffAutoModel(QEFFTransformersBase): +class QEFFAutoModel(QEFFTransformersBase): """ - The QEffAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. + The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. ``Mandatory`` Args: @@ -382,12 +382,21 @@ class QEffAutoModel(QEFFTransformersBase): .. code-block:: python - from QEfficient import QEffAutoModel + from QEfficient import QEFFAutoModel + from transformers import AutoTokenizer - model = QEffAutoModel.from_pretrained(model_name, num_hidden_layers=2) - model.compile() + # Initialize the model using from_pretrained similar to transformers.AutoModel. + model = QEFFAutoModel.from_pretrained("model_name") - model.generate(prompts=["Hello, world!"]) + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU + + #prepare input + tokenizer = AutoTokenizer.from_pretrained(model_name) + inputs = tokenizer("My name is", return_tensors="pt") + + # You can now execute the model + model.generate(inputs) """ _hf_auto_class = AutoModel @@ -414,6 +423,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): .. code-block:: python from QEfficient import QEFFAutoModel + from transformers import AutoTokenizer # Initialize the model using from_pretrained similar to transformers.AutoModel. model = QEFFAutoModel.from_pretrained("model_name") @@ -421,8 +431,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Now you can directly compile the model for Cloud AI 100 model.compile(num_cores=16, device_group=[0]) # Considering you have a Cloud AI 100 SKU + #prepare input + tokenizer = AutoTokenizer.from_pretrained(model_name) + inputs = tokenizer("My name is", return_tensors="pt") + # You can now execute the model - model.generate(prompts=["Hi there!!"]) + model.generate(inputs) """ if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -458,7 +472,7 @@ def export(self, export_dir: Optional[str] = None) -> str: Exports the model to ``ONNX`` format using ``torch.onnx.export``. ``Optional`` Args: - does not any arguments. 
+ :export_dir (str, optional): The directory path to store ONNX-graph. Returns: :str: Path of the generated ``ONNX`` graph. @@ -504,7 +518,11 @@ def compile( :compile_dir (str, optional): Path for saving the qpc generated. :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``. :batch_size (int, optional): Batch size. ``Defaults to 1``. + :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. :num_cores (int): Number of cores used to compile the model. + :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. + :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. """ @@ -527,96 +545,85 @@ def compile( def generate( self, - prompts: List[str], + inputs: Union[torch.Tensor, np.ndarray], device_ids: List[int] = [0], runtime_ai100: bool = True, seq_len: int = constants.Constants.CTX_LEN, ) -> dict: """ - This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. - If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. - + This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. ``Mandatory`` Args: - :prompts (List[str]): List of prompts to run the execution. + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model ``optional`` Args: :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. - + :eq_len (int, optional): Sequence length for the inputs. Defaults to constants.Constants.CTX_LEN. Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. """ + # Prepare input + input_ids = torch.nn.functional.pad( + inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0 + ) + attention_mask = torch.nn.functional.pad( + inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0 + ) + + inputs = dict(input_ids=input_ids, attention_mask=attention_mask) # AI_100 runtime if runtime_ai100: if not isinstance(self.qpc_path, Path): raise TypeError("Please run compile API first!") - return self.cloud_ai_100_feature_generate(prompts=prompts, device_ids=device_ids) + return self.cloud_ai_100_feature_generate(inputs=inputs, device_ids=device_ids) # PyTorch runtime else: - return self.pytorch_feature_generate(model=self.model, prompts=prompts, seq_len=seq_len) + return self.pytorch_feature_generate(model=self.model, inputs=inputs) def cloud_ai_100_feature_generate( self, - prompts: List[str], + inputs: Union[torch.Tensor, np.ndarray], device_ids: List[int] = [0], ): """ Generates features with list of prompts using AI 100 runtime. ``Mandatory`` Args: - prompts (List[str]): A list of input prompts to generate features for. + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. ``Optional`` Args: device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. 
Returns: - List[Dict[np.ndarray]]: A list of dictionaries containing the generated output features. + np.ndarray: A list of dictionaries containing the generated output features. """ + if self.qpc_session is None: self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) self.batch_size = self.qpc_session.bindings[0].dims[0] self.seq_len = self.qpc_session.bindings[0].dims[1] - outputs = [] - for prompt in prompts: - inputs = self.tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=self.seq_len) - - inputs = dict( - input_ids=inputs["input_ids"].numpy(), - attention_mask=inputs["attention_mask"].numpy(), - ) - output = { - "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( - np.float32 - ), - } - self.qpc_session.set_buffers(output) - output = self.qpc_session.run(inputs) - outputs.append(output) + inputs["input_ids"] = np.array(inputs["input_ids"]) + inputs["attention_mask"] = np.array(inputs["attention_mask"]) + outputs = { + "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( + np.float32 + ), + } + self.qpc_session.set_buffers(outputs) + outputs = self.qpc_session.run(inputs) return outputs - def pytorch_feature_generate( - self, - model, - prompts: List[str], - seq_len: int = constants.Constants.CTX_LEN, - ): + def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]): """ Generates features from a list of text prompts using a PyTorch model. ``Mandatory`` Args: - model: The PyTorch model used for generating features. - prompts (List[str]): A list of text prompts to be tokenized and processed. - ``Optional`` Args: - seq_len (int, optional): The maximum sequence length for tokenization. Defaults to constants.Constants.CTX_LEN. + model: The transformed PyTorch model used for generating features. + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. Returns: - List[torch.Tensor]: A list of output features generated by the model for each prompt. + torch.Tensor: A list of output features generated by the model for each prompt. """ - outputs = [] - for prompt in prompts: - inputs = self.tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - outputs.append(model(**inputs)) - return outputs + return model(**inputs) diff --git a/docs/source/hl_api.md b/docs/source/hl_api.md index 47dd6cde8..558965e76 100644 --- a/docs/source/hl_api.md +++ b/docs/source/hl_api.md @@ -8,7 +8,12 @@ :member-order: bysource :members: ``` - +## `QEFFAutoModel` +```{eval-rst} +.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel + :member-order: bysource + :members: +``` ## `QEffAutoPeftModelForCausalLM` ```{eval-rst} .. 
autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index ed41c7349..2de882b9c 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -9,10 +9,11 @@ import numpy as np import onnxruntime as ort import pytest +import torch +from transformers import AutoModel, AutoTokenizer -from QEfficient.transformers.models.modeling_auto import QEffAutoModel +from QEfficient.transformers.models.modeling_auto import QEFFAutoModel from QEfficient.utils import hf_download -from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants embed_test_models = [ @@ -32,40 +33,65 @@ def check_embed_pytorch_vs_ort_vs_ai100( repo_id=model_name, ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], ) + # Prepare input + tokenizer = AutoTokenizer.from_pretrained(model_name) + inputs = tokenizer("My name is", return_tensors="pt") - qeff_model = QEffAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, + input_ids = torch.nn.functional.pad(inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0) + attention_mask = torch.nn.functional.pad( + inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0 + ) + inputs = dict(input_ids=input_ids, attention_mask=attention_mask) + + # Original PyTorch model + pt_model = AutoModel.from_pretrained( + model_path, num_hidden_layers=n_layer, attn_implementation="eager", trust_remote_code=True, ) - prompt = "My name is" - pt_outputs = qeff_model.generate(prompts=["My name is"], runtime_ai100=False) + pt_outputs = pt_model(**inputs) + pt_embeddings = pt_outputs[0][0].detach().numpy() + + # Pytorch transformed model + qeff_model = QEFFAutoModel.from_pretrained( + pretrained_model_name_or_path=model_path, + num_hidden_layers=n_layer, + attn_implementation="eager", + trust_remote_code=True, + ) + qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) + qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy() + mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings)) + print("Mad for PyTorch and PyTorch transformed qeff_model is ", mad) + assert mad <= 0, f"MAD is too high for onnx and Pytorch: {mad}" onnx_model = qeff_model.export() ort_session = ort.InferenceSession(str(onnx_model)) + # Prepare the inputs for ONNX Runtime - tokenizer = load_hf_tokenizer(model_path) - inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", max_length=seq_len) - onnx_inputs = {"input_ids": inputs["input_ids"].numpy(), "attention_mask": inputs["attention_mask"].numpy()} + input_ids = np.array(input_ids) + attention_mask = np.array(attention_mask) + + onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} # Run inference onnx_outputs = ort_session.run(None, onnx_inputs) - # Compare PyTorch and ONNX outputs + # Compare Transformed PyTorch and ONNX outputs pt_embeddings = pt_outputs[0][0].detach().numpy() onnx_embeddings = onnx_outputs[0] mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) - print("Mad for onnx and pytorch is ", mad) + print("Mad for onnx and PyTorch is ", mad) assert mad <= 10**-5, f"MAD is too high for onnx and Pytorch: {mad}" qeff_model.compile( num_cores=14, ) - ai100_output = qeff_model.generate(prompts=["My name is"]) + ai100_output = qeff_model.generate(inputs=inputs) # Compare ONNX and AI 100 
outputs - mad = np.mean(np.abs(ai100_output[0]["output"] - onnx_outputs[0])) + mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0])) print("Mad for onnx and AI 100 output is ", mad) assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" From 6c9de4b414b4e2fa4076980119fee3f8ee3829c8 Mon Sep 17 00:00:00 2001 From: amitraj Date: Wed, 18 Dec 2024 16:02:06 +0530 Subject: [PATCH 15/17] fix-minor-2 Signed-off-by: amitraj --- QEfficient/base/modeling_qeff.py | 4 +- .../transformers/models/modeling_auto.py | 37 +++++++++---------- .../models/test_embedding_models.py | 29 +++------------ 3 files changed, 27 insertions(+), 43 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 064d7e6f0..82fc42215 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -251,7 +251,9 @@ def _compile( # Check if already compiled compile_hash = compile_hash.hexdigest()[:16] - qpc_path = qpc_path.with_name(qpc_path.name + "-" + compile_hash) + compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash) + qpc_path = compile_dir / "qpc" + qpc_path.mkdir(parents=True, exist_ok=True) if qpc_path.is_dir(): if (qpc_path / "programqpc.bin").is_file(): self.qpc_path = qpc_path diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 9233ed563..de21f070b 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -24,7 +24,6 @@ from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform from QEfficient.utils import constants, get_padding_shape_from_config -from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.cache import to_hashable logger = logging.getLogger(__file__) @@ -369,7 +368,7 @@ def generate( is_tlm=self.is_tlm, ) else: - raise ValueError("Only AI_100 runtime is supported right now via generate API") + raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") class QEFFAutoModel(QEFFTransformersBase): @@ -401,13 +400,12 @@ class QEFFAutoModel(QEFFTransformersBase): _hf_auto_class = AutoModel _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform] - _onnx_transforms = [FP16ClipTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model: nn.Module, **kwargs): super().__init__(model) self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers - self.tokenizer = load_hf_tokenizer(self.model.config.name_or_path) @classmethod @with_replaced_quantizers @@ -447,6 +445,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False}) try: model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + warnings.warn("Removing pooling layer from the model if exist") except TypeError: kwargs.pop("add_pooling_layer", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) @@ -545,10 +544,9 @@ def compile( def generate( self, - inputs: Union[torch.Tensor, np.ndarray], + inputs: torch.Tensor, device_ids: List[int] = [0], runtime_ai100: bool = True, - seq_len: int = constants.Constants.CTX_LEN, ) 
-> dict: """ This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. @@ -561,16 +559,6 @@ def generate( Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. """ - # Prepare input - input_ids = torch.nn.functional.pad( - inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0 - ) - attention_mask = torch.nn.functional.pad( - inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0 - ) - - inputs = dict(input_ids=input_ids, attention_mask=attention_mask) - # AI_100 runtime if runtime_ai100: if not isinstance(self.qpc_path, Path): @@ -583,7 +571,7 @@ def generate( def cloud_ai_100_feature_generate( self, - inputs: Union[torch.Tensor, np.ndarray], + inputs: torch.Tensor, device_ids: List[int] = [0], ): """ @@ -602,9 +590,19 @@ def cloud_ai_100_feature_generate( self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) self.batch_size = self.qpc_session.bindings[0].dims[0] self.seq_len = self.qpc_session.bindings[0].dims[1] + # Prepare input + input_ids_len = inputs["input_ids"].shape[1] + input_ids = np.array( + torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - inputs["input_ids"].size(1)), "constant", 0) + ) + attention_mask = np.array( + torch.nn.functional.pad( + inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0 + ) + ) + + inputs = dict(input_ids=input_ids, attention_mask=attention_mask) - inputs["input_ids"] = np.array(inputs["input_ids"]) - inputs["attention_mask"] = np.array(inputs["attention_mask"]) outputs = { "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( np.float32 @@ -612,6 +610,7 @@ def cloud_ai_100_feature_generate( } self.qpc_session.set_buffers(outputs) outputs = self.qpc_session.run(inputs) + outputs = outputs["output"][:, :input_ids_len, :] return outputs def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]): diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 2de882b9c..1c2d5196c 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -9,11 +9,9 @@ import numpy as np import onnxruntime as ort import pytest -import torch from transformers import AutoModel, AutoTokenizer from QEfficient.transformers.models.modeling_auto import QEFFAutoModel -from QEfficient.utils import hf_download from QEfficient.utils.constants import Constants embed_test_models = [ @@ -29,23 +27,13 @@ def check_embed_pytorch_vs_ort_vs_ai100( seq_len: int = Constants.CTX_LEN, n_layer: int = 1, ): - model_path = hf_download( - repo_id=model_name, - ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], - ) # Prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) inputs = tokenizer("My name is", return_tensors="pt") - input_ids = torch.nn.functional.pad(inputs["input_ids"], (0, seq_len - inputs["input_ids"].size(1)), "constant", 0) - attention_mask = torch.nn.functional.pad( - inputs["attention_mask"], (0, seq_len - inputs["attention_mask"].size(1)), "constant", 0 - ) - inputs = dict(input_ids=input_ids, attention_mask=attention_mask) - # Original PyTorch model pt_model = AutoModel.from_pretrained( - model_path, + model_name, num_hidden_layers=n_layer, attn_implementation="eager", trust_remote_code=True, @@ -53,14 +41,8 @@ def 
check_embed_pytorch_vs_ort_vs_ai100( pt_outputs = pt_model(**inputs) pt_embeddings = pt_outputs[0][0].detach().numpy() - # Pytorch transformed model - qeff_model = QEFFAutoModel.from_pretrained( - pretrained_model_name_or_path=model_path, - num_hidden_layers=n_layer, - attn_implementation="eager", - trust_remote_code=True, - ) + qeff_model = QEFFAutoModel(pt_model) qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) qeff_pt_embeddings = qeff_pt_outputs[0][0].detach().numpy() mad = np.mean(np.abs(pt_embeddings - qeff_pt_embeddings)) @@ -71,14 +53,15 @@ def check_embed_pytorch_vs_ort_vs_ai100( ort_session = ort.InferenceSession(str(onnx_model)) # Prepare the inputs for ONNX Runtime - input_ids = np.array(input_ids) - attention_mask = np.array(attention_mask) + input_ids = np.array(inputs["input_ids"]) + attention_mask = np.array(inputs["attention_mask"]) onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} # Run inference onnx_outputs = ort_session.run(None, onnx_inputs) # Compare Transformed PyTorch and ONNX outputs + pt_embeddings = pt_outputs[0][0].detach().numpy() onnx_embeddings = onnx_outputs[0] mad = np.mean(np.abs(pt_embeddings - onnx_embeddings)) @@ -91,7 +74,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( ai100_output = qeff_model.generate(inputs=inputs) # Compare ONNX and AI 100 outputs - mad = np.mean(np.abs(ai100_output["output"] - onnx_outputs[0])) + mad = np.mean(np.abs(ai100_output - onnx_outputs[0])) print("Mad for onnx and AI 100 output is ", mad) assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" From 88e0fe653fe895996a5db38324b3a76fe2cb5b0b Mon Sep 17 00:00:00 2001 From: amitraj Date: Wed, 18 Dec 2024 17:33:12 +0530 Subject: [PATCH 16/17] fix-minor-3 Signed-off-by: amitraj --- QEfficient/transformers/models/modeling_auto.py | 7 +++---- scripts/Jenkinsfile | 12 ++++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index de21f070b..83c573f6d 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -547,7 +547,7 @@ def generate( inputs: torch.Tensor, device_ids: List[int] = [0], runtime_ai100: bool = True, - ) -> dict: + ) -> Union[torch.Tensor, np.ndarray]: """ This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. ``Mandatory`` Args: @@ -573,7 +573,7 @@ def cloud_ai_100_feature_generate( self, inputs: torch.Tensor, device_ids: List[int] = [0], - ): + ) -> np.ndarray: """ Generates features with list of prompts using AI 100 runtime. @@ -613,7 +613,7 @@ def cloud_ai_100_feature_generate( outputs = outputs["output"][:, :input_ids_len, :] return outputs - def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]): + def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]: """ Generates features from a list of text prompts using a PyTorch model. @@ -624,5 +624,4 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray Returns: torch.Tensor: A list of output features generated by the model for each prompt. """ - return model(**inputs) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 2e6f17f4e..f1d37fe86 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -13,8 +13,8 @@ pipeline { steps { sh ''' . 
~/.bashrc - docker run --privileged -dit --name ${BUILD_TAG} -v ./:/efficient-transformers -v ${HF_PATH}:${DOCKER_HF_PATH} ${DOCKER_LATEST}:master_latest - docker exec ${BUILD_TAG} bash -c " + sudo docker run --privileged -dit --name ${BUILD_TAG} -v ./:/efficient-transformers -v ${HF_PATH}:${DOCKER_HF_PATH} ${DOCKER_LATEST}:master_latest + sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && apt update && apt install -y python3.10-venv && @@ -34,7 +34,7 @@ pipeline { steps { timeout(time: 10, unit: 'MINUTES') { sh ''' - docker exec ${BUILD_TAG} bash -c " + sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && . preflight_qeff/bin/activate && mkdir -p $PWD/Non_cli_qaic && @@ -50,7 +50,7 @@ pipeline { steps { timeout(time: 60, unit: 'MINUTES') { sh ''' - docker exec ${BUILD_TAG} bash -c " + sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && . preflight_qeff/bin/activate && mkdir -p $PWD/Non_qaic && @@ -68,7 +68,7 @@ pipeline { steps { timeout(time: 15, unit: 'MINUTES') { sh ''' - docker exec ${BUILD_TAG} bash -c " + sudo docker exec ${BUILD_TAG} bash -c " cd /efficient-transformers && . preflight_qeff/bin/activate && mkdir -p $PWD/cli && @@ -88,7 +88,7 @@ pipeline { script { try { sh ''' - docker rm -f ${BUILD_TAG} + sudo docker rm -f ${BUILD_TAG} sudo chown -R ubuntu . ''' } catch (error) { From 157142a274c821f34f5a53e1a6f60987ac4c5d1b Mon Sep 17 00:00:00 2001 From: amitraj Date: Wed, 18 Dec 2024 20:24:04 +0530 Subject: [PATCH 17/17] Update ONNX_EXPORT_OPSET to 13 Signed-off-by: amitraj --- QEfficient/utils/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 5e3a29072..4a3ba3ff3 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -47,7 +47,7 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32 ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep -ONNX_EXPORT_OPSET = 14 +ONNX_EXPORT_OPSET = 13 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]
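
For reference, below is a minimal usage sketch of the embedding workflow that this series converges on (QEFFAutoModel.from_pretrained, then export/compile, then generate). It is illustrative only: the checkpoint name, the core count, and the try/except guard are assumptions, and the compile and AI 100 steps additionally require the Cloud AI 100 toolchain and a device.

```python
# Sketch only: mirrors the QEFFAutoModel embedding API as it stands after these patches.
# Assumed: "BAAI/bge-small-en-v1.5" as an example checkpoint and num_cores=16; the
# AI 100 portion needs Cloud AI 100 hardware plus its SDK, so it is wrapped in a guard.
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModel

model_name = "BAAI/bge-small-en-v1.5"  # assumed example model
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("My name is", return_tensors="pt")

qeff_model = QEFFAutoModel.from_pretrained(model_name)

# PyTorch runtime: returns the transformed model's forward output.
pt_out = qeff_model.generate(inputs=inputs, runtime_ai100=False)
pt_embeddings = pt_out[0][0].detach().numpy()

# ONNX Runtime check against the exported graph (no AI 100 device needed).
onnx_path = qeff_model.export()
ort_session = ort.InferenceSession(str(onnx_path))
onnx_out = ort_session.run(
    None,
    {
        "input_ids": inputs["input_ids"].numpy(),
        "attention_mask": inputs["attention_mask"].numpy(),
    },
)
print("MAD (PyTorch vs ONNX):", np.mean(np.abs(pt_embeddings - onnx_out[0])))

# Cloud AI 100 runtime: requires the platform toolchain and a device.
try:
    qeff_model.compile(num_cores=16)                # assumed core count
    ai100_out = qeff_model.generate(inputs=inputs)  # np.ndarray, trimmed to input length
    print("MAD (ONNX vs AI 100):", np.mean(np.abs(ai100_out - onnx_out[0])))
except Exception as exc:  # no Cloud AI 100 toolchain or device available
    print("Skipping AI 100 run:", exc)
```

As of the later patches in the series, the AI 100 path pads the tokenized inputs to the compiled sequence length internally and trims the returned embeddings back to the original input length, so the two MAD checks above compare tensors of the same shape.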