[Tests]: Adding dummy causal models for testing in regular CI run #427

Open
wants to merge 7 commits into base: main
4 changes: 2 additions & 2 deletions scripts/Jenkinsfile
@@ -58,7 +58,7 @@ pipeline {
mkdir -p $PWD/Non_qaic &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Non_qaic &&
pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml &&
pytest tests -m '(not cli) and (on_qaic) and (not nightly) and (not multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml &&
junitparser merge tests/tests_log2.xml tests/tests_log.xml &&
deactivate"
'''
@@ -144,7 +144,7 @@ pipeline {
mkdir -p $PWD/Qnn_non_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_non_cli &&
pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml &&
pytest tests -m '(not cli) and (qnn) and (not nightly) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml &&
junitparser merge tests/tests_log5.xml tests/tests_log.xml &&
deactivate"
'''
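
Both stages add "(not nightly)" to their pytest -m expression, so the full pretrained-checkpoint tests (marked nightly) are kept out of this regular CI run, while the new dummy-model tests (marked regular and on_qaic) are still selected. Below is a minimal sketch, assuming the nightly and regular markers are not already registered in pytest.ini or pyproject.toml, of how they could be declared in a conftest.py so the -m filters above run without unknown-marker warnings:

# conftest.py -- illustrative sketch, not part of this PR; the repository may already
# register these markers elsewhere.
def pytest_configure(config):
    config.addinivalue_line("markers", "nightly: full pretrained-checkpoint tests, run only in the nightly job")
    config.addinivalue_line("markers", "regular: lightweight dummy-config tests, run on every CI pass")

# A local run mirroring the first stage above:
#   pytest tests -m '(not cli) and (on_qaic) and (not nightly) and (not multimodal) and (not qnn)' -n 4
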
264 changes: 254 additions & 10 deletions tests/transformers/models/test_causal_lm_models.py
@@ -5,13 +5,15 @@
#
# -----------------------------------------------------------------------------

import copy
import os
from typing import Optional
from typing import List, Optional

import numpy as np
import pytest
import torch
from transformers import AutoModelForCausalLM
import torch.nn as nn
from transformers import AutoConfig, AutoModelForCausalLM

from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
@@ -49,17 +51,178 @@
"hpcai-tech/grok-1",
]

test_dummy_model_configs = [
# model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}),
("gpt2", "gpt2", 128, 1, 2, 64, 256, 50257, {"num_key_value_heads": 1}),
(
"Salesforce/codegen-350M-mono",
"codegen",
128,
1,
4,
64,
256,
51200,
{"num_key_value_heads": 1, "rotary_dim": 16},
),
# ("microsoft/Phi-3-mini-4k-instruct","phi3", 128, 1, 2, 64, 256, 32064, {}), ouput not matching
("tiiuae/falcon-7b", "falcon", 128, 1, 2, 64, 256, 65024, {"num_key_value_heads": 1}),
("Qwen/Qwen2-0.5B", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1}),
("bigcode/starcoder2-3b", "starcoder2", 128, 1, 2, 64, 256, 49152, {"num_key_value_heads": 1}),
("Felladrin/Minueza-32M-Base", "mistral", 128, 1, 2, 64, 256, 32002, {"num_key_value_heads": 1}),
("wtang06/mpt-125m-c4", "mpt", 128, 1, 2, 64, 256, 50368, {}),
("hakurei/gpt-j-random-tinier", "gptj", 128, 1, 2, 64, 256, 50400, {"num_key_value_heads": 1, "rotary_dim": 16}),
("mistralai/Mixtral-8x7B-Instruct-v0.1", "mixtral", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}),
(
"meta-llama/Llama-3.2-1B",
"llama",
128,
1,
2,
64,
256,
128256,
{
"num_key_value_heads": 1,
"rope_scaling": {
"factor": 32.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3",
},
},
),
(
"unsloth/gemma-2b",
"gemma",
128,
1,
2,
64,
256,
256000,
{"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2b"},
),
(
"unsloth/gemma-2-2b",
"gemma2",
128,
1,
2,
64,
256,
256000,
{"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2-2b"},
),
# ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", "llama", 128, 1, 2, 64, 256, 32003, {"num_key_value_heads": 1, "architectures": ["LlamaForCausalLM"], "pad_token_id": 0}),
# ("TheBloke/Llama-2-7B-GPTQ", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 2}),
(
"ibm-granite/granite-20b-code-base",
"gpt_bigcode",
128,
1,
2,
64,
256,
49152,
{"num_key_value_heads": 1, "activation_function": "gelu", "architectures": ["GPTBigCodeForCausalLM"]},
),
# ("neuralmagic/Llama-3.2-3B-Instruct-FP8", "llama", 128, 1, 2, 64, 256, 128256, {"num_key_value_heads": 2}),
# ("neuralmagic/Qwen2-0.5B-Instruct-FP8", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1,"quantization_config": {"activation_scheme": "static","ignored_layers": [ "lm_head" ],"quant_method": "fp8"}}),
# ("ibm-granite/granite-3.1-2b-instruct", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 2}),
("ibm-granite/granite-guardian-3.1-2b", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 1}),
]


def get_model_configs_and_names(configs: List[tuple]):
configs = [
(
AutoConfig.for_model(
model_type,
max_position_embeddings=max_position_embeddings,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
vocab_size=vocab_size,
**additional_params,
),
model_name,
)
for (
model_name,
model_type,
max_position_embeddings,
num_hidden_layers,
num_attention_heads,
hidden_size,
intermediate_size,
vocab_size,
additional_params,
) in configs
]
names = [y for (_, y) in configs]
return configs, names


test_dummy_model_configs, test_dummy_model_names = get_model_configs_and_names(test_dummy_model_configs)
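

# Illustrative sketch (not part of this PR): what a single entry of test_dummy_model_configs
# expands to. AutoConfig.for_model() turns the tuple fields into a reduced config (1 hidden
# layer, hidden_size 64), and AutoModelForCausalLM.from_config() then builds a randomly
# initialised model without downloading any checkpoint weights.
def _example_tiny_llama_from_config():  # hypothetical helper, shown for illustration only
    cfg = AutoConfig.for_model(
        "llama",
        max_position_embeddings=128,
        num_hidden_layers=1,
        num_attention_heads=2,
        hidden_size=64,
        intermediate_size=256,
        vocab_size=32000,
        num_key_value_heads=1,
    )
    return AutoModelForCausalLM.from_config(cfg, attn_implementation="eager")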

test_models_qnn = [
"mistralai/Mixtral-8x7B-Instruct-v0.1",
"meta-llama/Llama-3.2-1B",
"unsloth/gemma-2b",
"ibm-granite/granite-guardian-3.1-2b",
]
test_dummy_model_configs_qnn = [
# model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params
("mistralai/Mixtral-8x7B-Instruct-v0.1", "mixtral", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}),
(
"meta-llama/Llama-3.2-1B",
"llama",
128,
1,
2,
64,
256,
128256,
{
"num_key_value_heads": 1,
"rope_scaling": {
"factor": 32.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3",
},
},
),
(
"unsloth/gemma-2b",
"gemma",
128,
1,
2,
64,
256,
256000,
{"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2b"},
),
("ibm-granite/granite-guardian-3.1-2b", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 1}),
]
test_dummy_model_configs_qnn, test_dummy_model_names_qnn = get_model_configs_and_names(test_dummy_model_configs_qnn)

spd_test_models = [
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"Qwen/Qwen2-0.5B",
]
test_dummy_model_configs_spd = [
# model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}),
("Qwen/Qwen2-0.5B", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1}),
]
test_dummy_model_configs_spd, test_dummy_model_names_spd = get_model_configs_and_names(test_dummy_model_configs_spd)


def load_causal_lm_model(model_config):
@@ -101,6 +264,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
prefill_only: Optional[bool] = None,
enable_qnn: Optional[bool] = False,
qnn_config: Optional[str] = None,
model_hf: Optional[nn.Module] = None,
):
"""
Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
@@ -113,8 +277,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
replace_transformers_quantizers()
model_config = {"model_name": model_name}
model_config["n_layer"] = n_layer

model_hf, _ = load_causal_lm_model(model_config)
if model_hf is None:
model_hf, _ = load_causal_lm_model(model_config)
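    # keep an untouched copy for the continuous-batching run below; the original model_hf is
    # transformed in place once it is wrapped in QEFFAutoModelForCausalLM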
model_hf_cb = copy.deepcopy(model_hf)

tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
config = model_hf.config
@@ -129,16 +294,13 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
)

pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)

is_tlm = False if num_speculative_tokens is None else True
qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm, pretrained_model_name_or_path=model_name)

pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)

assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
"Tokens don't match for HF PyTorch model output and KV PyTorch model output"
)

onnx_model_path = qeff_model.export()
ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm)
gen_len = ort_tokens.shape[-1]
@@ -147,7 +309,6 @@

if not get_available_device_id():
pytest.skip("No available devices to run model on Cloud AI 100")

qpc_path = qeff_model.compile(
prefill_seq_len=prompt_len,
ctx_len=ctx_len,
@@ -174,8 +335,10 @@
assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
if prefill_only is not None:
return

# testing for CB models
model_hf, _ = load_causal_lm_model(model_config)
model_hf = model_hf_cb
model_hf.eval()
full_batch_size = 4
fbs_prompts = Constants.INPUT_STR * 4
api_runner = ApiRunner(
Expand Down Expand Up @@ -252,6 +415,28 @@ def test_causal_lm_export_with_deprecated_api(model_name):
)


@pytest.mark.on_qaic
@pytest.mark.regular
@pytest.mark.parametrize(
"test_dummy_model_config, test_dummy_model_name", test_dummy_model_configs, ids=test_dummy_model_names
)
def test_dummy_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_config, test_dummy_model_name):
"""
    Test function to validate a dummy (randomly initialised, single-layer) PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
    ``Mandatory`` Args:
        :test_dummy_model_config (PretrainedConfig): reduced configuration used to build the dummy model
        :test_dummy_model_name (str): Hugging Face Model Card name, Example: ``gpt2``
"""

torch.manual_seed(42)
model_hf = AutoModelForCausalLM.from_config(
test_dummy_model_config,
attn_implementation="eager",
)
model_hf.eval()
check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_name, model_hf=model_hf)


@pytest.mark.nightly
@pytest.mark.on_qaic
@pytest.mark.parametrize("model_name", test_models_qaic)
def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
@@ -268,6 +453,36 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)


@pytest.mark.on_qaic
@pytest.mark.regular
@pytest.mark.qnn
@pytest.mark.parametrize(
"test_dummy_model_config_qnn, test_dummy_model_name_qnn",
test_dummy_model_configs_qnn,
ids=test_dummy_model_names_qnn,
)
def test_dummy_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(test_dummy_model_config_qnn, test_dummy_model_name_qnn):
"""
    Test function to validate a dummy (randomly initialised, single-layer) PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model compiled through the QNN flow, both with and without continuous batching.
    ``Mandatory`` Args:
        :test_dummy_model_config_qnn (PretrainedConfig): reduced configuration used to build the dummy model
        :test_dummy_model_name_qnn (str): Hugging Face Model Card name, Example: ``gpt2``
"""

torch.manual_seed(42)
model_hf = AutoModelForCausalLM.from_config(
test_dummy_model_config_qnn,
attn_implementation="eager",
)
model_hf.eval()
qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)

check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
test_dummy_model_name_qnn, enable_qnn=True, qnn_config=qnn_config_json_path, model_hf=model_hf
)


@pytest.mark.nightly
@pytest.mark.on_qaic
@pytest.mark.qnn
@pytest.mark.parametrize("model_name", test_models_qnn)
@@ -292,6 +507,35 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):


@pytest.mark.skip()  # remove when the SDK 1.20.0 issue with compiling this model is resolved
@pytest.mark.regular
@pytest.mark.on_qaic
@pytest.mark.qnn
@pytest.mark.parametrize(
"test_dummy_model_config_spd, test_dummy_model_name_spd",
test_dummy_model_configs_spd,
ids=test_dummy_model_names_spd,
)
def test_dummy_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_config_spd, test_dummy_model_name_spd):
"""
    Test function to validate a dummy (randomly initialised, single-layer) target model with speculative decoding enabled (TLM), the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
    ``Mandatory`` Args:
        :test_dummy_model_config_spd (PretrainedConfig): reduced configuration used to build the dummy model
        :test_dummy_model_name_spd (str): Hugging Face Model Card name, Example: ``gpt2``
"""

torch.manual_seed(42)
model_hf = AutoModelForCausalLM.from_config(
test_dummy_model_config_spd,
attn_implementation="eager",
)
model_hf.eval()

check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
model_name=test_dummy_model_name_spd, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS, model_hf=model_hf
)


@pytest.mark.skip()  # remove when the SDK 1.20.0 issue with compiling this model is resolved
@pytest.mark.nightly
@pytest.mark.on_qaic
@pytest.mark.parametrize("model_name", spd_test_models)
def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
@@ -316,7 +560,7 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
"""
Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
"""
model_name = "gpt2"
model_name = "gpt2" # hf-internal-testing/tiny-random-gpt2
prompt_len = 1

check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)