diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index e6a69d5fb..f560904f1 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -58,7 +58,7 @@ pipeline { mkdir -p $PWD/Non_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic && - pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml && + pytest tests -m '(not cli) and (on_qaic) and (not nightly) and (not multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml && junitparser merge tests/tests_log2.xml tests/tests_log.xml && deactivate" ''' @@ -144,7 +144,7 @@ pipeline { mkdir -p $PWD/Qnn_non_cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Qnn_non_cli && - pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml && + pytest tests -m '(not cli) and (qnn) and (not nightly) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml && junitparser merge tests/tests_log5.xml tests/tests_log.xml && deactivate" ''' diff --git a/tests/cloud/conftest.py b/tests/cloud/conftest.py index 8b17297ac..a130bbdbe 100644 --- a/tests/cloud/conftest.py +++ b/tests/cloud/conftest.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil @@ -145,165 +144,6 @@ def custom_io_file_path(self): return str(os.path.join(self.onnx_dir_path(), "custom_io_fp16.yaml")) -@pytest.fixture(scope="function") -def setup( - model_name, - num_cores, - prompt, - prompts_txt_file_path, - aic_enable_depth_first, - mos, - cache_dir, - hf_token, - batch_size, - prompt_len, - ctx_len, - mxfp6, - mxint8, - full_batch_size, - device_group, - enable_qnn, - qnn_config, -): - """ - It is a fixture or shared object of all testing script within or inner folder, - Args are coming from the dynamically generated tests method i.e, pytest_generate_tests via testing script or method - -------- - Args: same as set up initialization - Return: model_setup class object - """ - model_setup = ModelSetup( - model_name, - num_cores, - prompt, - prompts_txt_file_path, - bool(aic_enable_depth_first), - mos, - cache_dir, - hf_token, - batch_size, - prompt_len, - ctx_len, - bool(mxfp6), - bool(mxint8), - full_batch_size, - device_group, - enable_qnn, - qnn_config, - ) - - yield model_setup - del model_setup - - -def pytest_generate_tests(metafunc): - """ - pytest_generate_tests hook is used to create our own input parametrization, - It generates all the test cases of different combination of input parameters which are read from the json file, - and passed to each testing script module. 
- ----------- - Ref: https://docs.pytest.org/en/7.3.x/how-to/parametrize.html - """ - json_file = "tests/cloud/high_level_testing.json" - with open(json_file, "r") as file: - json_data = json.load(file) - - metafunc.parametrize("model_name", json_data["model_name"], ids=lambda x: "model_name=" + str(x)) - metafunc.parametrize("num_cores", json_data["num_cores"], ids=lambda x: "num_cores=" + str(x)) - metafunc.parametrize("prompt", json_data["prompt"], ids=lambda x: "prompt=" + str(x)) - metafunc.parametrize( - "prompts_txt_file_path", json_data["prompts_txt_file_path"], ids=lambda x: "prompts_txt_file_path=" + str(x) - ) - metafunc.parametrize( - "aic_enable_depth_first", json_data["aic_enable_depth_first"], ids=lambda x: "aic_enable_depth_first=" + str(x) - ) - metafunc.parametrize("mos", json_data["mos"], ids=lambda x: "mos=" + str(x)) - metafunc.parametrize("cache_dir", [None], ids=lambda x: "cache_dir=" + str(x)) - metafunc.parametrize("hf_token", json_data["hf_token"], ids=lambda x: "hf_token=" + str(x)) - metafunc.parametrize("batch_size", json_data["batch_size"], ids=lambda x: "batch_size=" + str(x)) - metafunc.parametrize("prompt_len", json_data["prompt_len"], ids=lambda x: "prompt_len=" + str(x)) - metafunc.parametrize("ctx_len", json_data["ctx_len"], ids=lambda x: "ctx_len=" + str(x)) - metafunc.parametrize("mxfp6", json_data["mxfp6"], ids=lambda x: "mxfp6=" + str(x)) - metafunc.parametrize("mxint8", json_data["mxint8"], ids=lambda x: "mxint8=" + str(x)) - metafunc.parametrize("full_batch_size", json_data["full_batch_size"], ids=lambda x: "full_batch_size=" + str(x)) - metafunc.parametrize("device_group", json_data["device_group"], ids=lambda x: "device_group=" + str(x)) - metafunc.parametrize("enable_qnn", json_data["enable_qnn"], ids=lambda x: "enable_qnn=" + str(x)) - metafunc.parametrize("qnn_config", json_data["qnn_config"], ids=lambda x: "qnn_config=" + str(x)) - - -def pytest_collection_modifyitems(config, items): - """ - pytest_collection_modifyitems is pytest a hook, - which is used to re-order the execution order of the testing script/methods - with various combination of inputs. - called after collection has been performed, may filter or re-order the items in-place. 
- Parameters: - items (List[_pytest.nodes.Item]) list of item objects - ---------- - Ref: https://docs.pytest.org/en/4.6.x/reference.html#collection-hooks - """ - run_first = ["test_export", "test_infer"] - modules_name = {item.module.__name__ for item in items} - cloud_modules = [] - non_cloud_modules = [] - for module in modules_name: - if module in run_first: - cloud_modules.append(module) - else: - non_cloud_modules.append(module) - - if len(cloud_modules) > 1: - modules = {item: item.module.__name__ for item in items} - items[:] = sorted(items, key=lambda x: run_first.index(modules[x]) if modules[x] in run_first else len(items)) - - non_cloud_tests = [] - - for itm in items: - if modules[itm] not in cloud_modules: - non_cloud_tests.append(itm) - - num_cloud_tests = len(items) - len(non_cloud_tests) - num_cloud_test_cases = num_cloud_tests // len(cloud_modules) - final_items = [] - - for i in range(num_cloud_test_cases): - for j in range(len(cloud_modules)): - final_items.append(items[i + j * num_cloud_test_cases]) - - final_items.extend(non_cloud_tests) - items[:] = final_items - - if config.getoption("--all"): - return - - first_model = items[0].callspec.params["model_name"] if hasattr(items[0], "callspec") else None - - for item in items: - if item.module.__name__ in ["test_export", "test_compile_and_execute", "test_infer"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if not params["enable_qnn"] and params["qnn_config"] is not None: - item.add_marker( - pytest.mark.skip(reason="Skipping because same as enable_qnn = false and qnn_config = None") - ) - if params["enable_qnn"]: - item.add_marker(pytest.mark.qnn) - - if item.module.__name__ in ["test_export", "test_compile_and_execute"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if params["model_name"] != first_model: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - if params["prompt_len"] == 2: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - - if item.module.__name__ in ["test_infer"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if params["prompt_len"] == 2 and params["model_name"] != first_model: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - - def qeff_models_clean_up(): if os.path.exists(QEFF_MODELS_DIR): shutil.rmtree(QEFF_MODELS_DIR) diff --git a/tests/cloud/high_level_testing.json b/tests/cloud/high_level_testing.json deleted file mode 100644 index d30382dc6..000000000 --- a/tests/cloud/high_level_testing.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "license": "SEE LICENSE IN LICENSE FILE", - "model_name" : ["gpt2"], - "num_cores" : [16], - "prompt" : ["My name is"], - "prompts_txt_file_path" : ["examples/prompts.txt"], - "aic_enable_depth_first" : [1], - "mos" : [1], - "cache_dir" : [null], - "hf_token" : [null], - "batch_size" : [1], - "prompt_len" : [32], - "ctx_len" : [128], - "mxfp6" : [1], - "mxint8" : [1], - "device_group" : [null], - "full_batch_size" : [null,3], - "enable_qnn" : [false, true], - "qnn_config" : [null, "QEfficient/compile/qnn_config.json"] -} diff --git a/tests/cloud/test_compile_and_execute.py b/tests/cloud/test_compile_and_execute.py deleted file mode 100644 index 341d63bb7..000000000 --- a/tests/cloud/test_compile_and_execute.py +++ /dev/null @@ -1,80 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os - -import pytest -import yaml - -import QEfficient -from QEfficient.cloud.execute import main as execute -from QEfficient.cloud.export import get_onnx_model_path - - -@pytest.mark.on_qaic -@pytest.mark.cli -def test_compile(setup, mocker): - """ - test_compile is a HL compile api testing function, - checks compile api code flow, object creations, internal api calls, internal returns. - --------- - Parameters: - setup: is a fixture defined in conftest.py module. - mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. - """ - ms = setup - onnx_model_path = get_onnx_model_path( - model_name=ms.model_name, - cache_dir=ms.cache_dir, - hf_token=ms.hf_token, - full_batch_size=ms.full_batch_size, - local_model_dir=ms.local_model_dir, - ) - - base_key = "past_key." - base_value = "past_value." - precision = "float16" - - data = [] - - for i in range(12): - data.append({"IOName": f"{base_key}{i}", "Precision": precision}) - data.append({"IOName": f"{base_value}{i}", "Precision": precision}) - - for i in range(12): - data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) - data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) - - with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file: - yaml.dump(data, file) - - qpc_path = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(ms.qpc_dir_path()), - num_cores=ms.num_cores, - device_group=ms.device_group, - custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml", - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - batch_size=ms.batch_size, - prompt_len=ms.prompt_len, - ctx_len=ms.ctx_len, - mxfp6=ms.mxfp6, - mxint8=ms.mxint8, - full_batch_size=ms.full_batch_size, - enable_qnn=ms.enable_qnn, - ) - - execute( - model_name=ms.model_name, - qpc_path=qpc_path, - prompt=ms.prompt, - prompts_txt_file_path=ms.prompts_txt_file_path, - generation_len=ms.generation_len, - hf_token=ms.hf_token, - full_batch_size=ms.full_batch_size, - ) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py deleted file mode 100644 index df5b12f5e..000000000 --- a/tests/cloud/test_export.py +++ /dev/null @@ -1,31 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - - -import pytest - -from QEfficient.cloud.export import main as export - - -@pytest.mark.cli -def test_export(setup, mocker): - """ - test_export is a HL export api testing function, - checks export api code flow, object creations, internal api calls, internal returns. - --------- - Parameters: - setup: is a fixture defined in conftest.py module. - mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
- """ - ms = setup - - export( - model_name=ms.model_name, - hf_token=ms.hf_token, - local_model_dir=ms.local_model_dir, - full_batch_size=ms.full_batch_size, - ) diff --git a/tests/cloud/test_export_compile_execute.py b/tests/cloud/test_export_compile_execute.py new file mode 100644 index 000000000..112b2cd96 --- /dev/null +++ b/tests/cloud/test_export_compile_execute.py @@ -0,0 +1,169 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +import os + +import pytest +import yaml +from conftest import ModelSetup + +import QEfficient +from QEfficient.cloud.execute import main as execute +from QEfficient.cloud.export import main as export + +configs = [ + { + "model_name": "gpt2", + "num_cores": 16, + "prompt": "My name is", + "prompts_txt_file_path": "examples/prompts.txt", + "aic_enable_depth_first": 1, + "mos": 1, + "cache_dir": None, + "hf_token": None, + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 128, + "mxfp6": 1, + "mxint8": 1, + "device_group": None, + "full_batch_size": 3, + "enable_qnn": True, + "qnn_config": "QEfficient/compile/qnn_config.json", + "image_url": "https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", + } +] + + +def check_export_compile_execute( + mocker, + **kwargs, +): + # Setup model + model_setup = ModelSetup( + kwargs["model_name"], + kwargs["num_cores"], + kwargs["prompt"], + kwargs["prompts_txt_file_path"], + bool(kwargs["aic_enable_depth_first"]), + kwargs["mos"], + kwargs["cache_dir"], + kwargs["hf_token"], + kwargs["batch_size"], + kwargs["prompt_len"], + kwargs["ctx_len"], + bool(kwargs["mxfp6"]), + bool(kwargs["mxint8"]), + kwargs["full_batch_size"], + kwargs["device_group"], + kwargs["enable_qnn"], + kwargs["qnn_config"], + ) + + # Spy on internal functions + mocker.spy(QEfficient.utils, "check_and_assign_cache_dir") + mock_get_onnx = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") + + # Export model + export( + model_name=model_setup.model_name, + hf_token=model_setup.hf_token, + local_model_dir=model_setup.local_model_dir, + full_batch_size=model_setup.full_batch_size, + ) + + onnx_model_path = mock_get_onnx.spy_return + print(f"Captured ONNX path: {onnx_model_path}") + + base_key = "past_key." + base_value = "past_value." 
+ precision = "float16" + + data = [] + + for i in range(12): + data.append({"IOName": f"{base_key}{i}", "Precision": precision}) + data.append({"IOName": f"{base_value}{i}", "Precision": precision}) + + for i in range(12): + data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) + data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) + + with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file: + yaml.dump(data, file) + + # Compile model + qpc_path = QEfficient.compile( + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(model_setup.qpc_dir_path()), + num_cores=model_setup.num_cores, + device_group=model_setup.device_group, + custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml", + aic_enable_depth_first=model_setup.aic_enable_depth_first, + mos=model_setup.mos, + batch_size=model_setup.batch_size, + prompt_len=model_setup.prompt_len, + ctx_len=model_setup.ctx_len, + mxfp6=model_setup.mxfp6, + mxint8=model_setup.mxint8, + full_batch_size=model_setup.full_batch_size, + enable_qnn=model_setup.enable_qnn, + ) + + # Execute model + execute( + model_name=model_setup.model_name, + qpc_path=qpc_path, + prompt=model_setup.prompt, + prompts_txt_file_path=model_setup.prompts_txt_file_path, + generation_len=model_setup.generation_len, + hf_token=model_setup.hf_token, + full_batch_size=model_setup.full_batch_size, + ) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.parametrize("config", configs) +def test_export_compile_execute(mocker, config): + # testing export -> compile -> infer without full_batch_size + + local_config = config.copy() + local_config.update(full_batch_size=None, enable_qnn=False, qnn_config=None) + check_export_compile_execute(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.parametrize("config", configs) +def test_export_compile_execute_fb(mocker, config): + # testing export -> compile -> infer with full_batch_size + local_config = config.copy() + local_config.update(enable_qnn=False, qnn_config=None) + check_export_compile_execute(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.cli +@pytest.mark.parametrize("config", configs) +def test_export_compile_execute_qnn(mocker, config): + # testing export -> compile -> infer without full_batch_size in QNN enviroment + local_config = config.copy() + local_config.update(full_batch_size=None, enable_qnn=False, qnn_config=None) + check_export_compile_execute(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.cli +@pytest.mark.parametrize("config", configs) +def test_export_compile_execute_qnn_fb(mocker, config): + # testing export -> compile -> infer with full_batch_size in QNN enviroment + local_config = config.copy() + check_export_compile_execute(mocker=mocker, **local_config) diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 396d9609d..6d183a5d9 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,28 +5,56 @@ # # ----------------------------------------------------------------------------- - import pytest +from conftest import ModelSetup from QEfficient.cloud.infer import main as infer +configs = [ + { + "model_name": "gpt2", + "num_cores": 16, + "prompt": "My name is", + "prompts_txt_file_path": "examples/prompts.txt", + "aic_enable_depth_first": 1, + "mos": 1, + "cache_dir": None, + "hf_token": None, + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 128, + "mxfp6": 1, + "mxint8": 
1, + "device_group": None, + "full_batch_size": 3, + "enable_qnn": True, + "qnn_config": "QEfficient/compile/qnn_config.json", + "image_url": "https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", + } +] + + +def check_infer(mocker, generation_len=32, **kwargs): + ms = ModelSetup( + kwargs["model_name"], + kwargs["num_cores"], + kwargs["prompt"], + kwargs["prompts_txt_file_path"], + bool(kwargs["aic_enable_depth_first"]), + kwargs["mos"], + kwargs["cache_dir"], + kwargs["hf_token"], + kwargs["batch_size"], + kwargs["prompt_len"], + kwargs["ctx_len"], + bool(kwargs["mxfp6"]), + bool(kwargs["mxint8"]), + kwargs["full_batch_size"], + kwargs["device_group"], + kwargs["enable_qnn"], + kwargs["qnn_config"], + ) -@pytest.mark.on_qaic -@pytest.mark.cli -@pytest.mark.usefixtures("clean_up_after_test") -def test_infer(setup, mocker): - """ - test_infer is a HL infer api testing function, - checks infer api code flow, object creations, internal api calls, internal returns. - --------- - Parameters: - setup: is a fixture defined in conftest.py module. - mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. - --------- - Ref: https://docs.pytest.org/en/7.1.x/how-to/fixtures.html - Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html - """ - ms = setup infer( model_name=ms.model_name, num_cores=ms.num_cores, @@ -39,9 +67,91 @@ def test_infer(setup, mocker): batch_size=ms.batch_size, prompt_len=ms.prompt_len, ctx_len=ms.ctx_len, - generation_len=ms.generation_len, + generation_len=generation_len, mxfp6=ms.mxfp6, mxint8=ms.mxint8, full_batch_size=ms.full_batch_size, enable_qnn=ms.enable_qnn, + qnn_config=ms.qnn_config, + image_url=kwargs["image_url"], + ) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer(mocker, config): + """ + test_infer is a HL infer api testing function, + checks infer api code flow, object creations, internal api calls, internal returns. + --------- + Parameters: + setup: is a fixture defined in conftest.py module. + mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
+ --------- + Ref: https://docs.pytest.org/en/7.1.x/how-to/fixtures.html + Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html + """ + # testing infer without full_batch_size + local_config = config.copy() + local_config.update(full_batch_size=None, enable_qnn=False, qnn_config=None) + check_infer(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer_fb(mocker, config): + # testing infer with full_batch_size + local_config = config.copy() + local_config.update(enable_qnn=False, qnn_config=None) + check_infer(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.qnn +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer_qnn(mocker, config): + # testing infer without full_batch_size in QNN enviroment + local_config = config.copy() + local_config.update( + full_batch_size=None, + ) + check_infer(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.qnn +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer_qnn_fb(mocker, config): + # testing infer with full_batch_size in QNN enviroment + local_config = config.copy() + check_infer(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.multimodal +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer_vlm(mocker, config): + # testing infer for MM models + local_config = config.copy() + local_config.update( + { + "model_name": "llava-hf/llava-1.5-7b-hf", + "prompt": "Describe the image.", + "prompt_len": 1024, + "ctx_len": 2048, + "full_batch_size": None, + "enable_qnn": False, + "qnn_config": None, + } ) + check_infer(mocker=mocker, generation_len=20, **local_config) diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py deleted file mode 100644 index 94adb3f36..000000000 --- a/tests/cloud/test_infer_vlm.py +++ /dev/null @@ -1,41 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import pytest - -from QEfficient.cloud.infer import main as infer - - -@pytest.mark.on_qaic -@pytest.mark.cli -@pytest.mark.multimodal -@pytest.mark.usefixtures("clean_up_after_test") -def test_vlm_cli(setup, mocker): - ms = setup - # Taking some values from setup fixture and assigning other's based on model's requirement. - # For example, mxint8 is not required for VLM models, so assigning False. 
- infer( - model_name="llava-hf/llava-1.5-7b-hf", - num_cores=ms.num_cores, - prompt="Describe the image.", - prompts_txt_file_path=None, - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - batch_size=1, - full_batch_size=None, - prompt_len=1024, - ctx_len=2048, - generation_len=20, - mxfp6=False, - mxint8=False, - local_model_dir=None, - cache_dir=None, - hf_token=ms.hf_token, - enable_qnn=False, - qnn_config=None, - image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", - ) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 7099ab604..98cba4143 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -5,13 +5,15 @@ # # ----------------------------------------------------------------------------- +import copy import os -from typing import Optional +from typing import List, Optional import numpy as np import pytest import torch -from transformers import AutoModelForCausalLM +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM @@ -49,17 +51,178 @@ "hpcai-tech/grok-1", ] +test_dummy_model_configs = [ + # model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params + ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}), + ("gpt2", "gpt2", 128, 1, 2, 64, 256, 50257, {"num_key_value_heads": 1}), + ( + "Salesforce/codegen-350M-mono", + "codegen", + 128, + 1, + 4, + 64, + 256, + 51200, + {"num_key_value_heads": 1, "rotary_dim": 16}, + ), + # ("microsoft/Phi-3-mini-4k-instruct","phi3", 128, 1, 2, 64, 256, 32064, {}), ouput not matching + ("tiiuae/falcon-7b", "falcon", 128, 1, 2, 64, 256, 65024, {"num_key_value_heads": 1}), + ("Qwen/Qwen2-0.5B", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1}), + ("bigcode/starcoder2-3b", "starcoder2", 128, 1, 2, 64, 256, 49152, {"num_key_value_heads": 1}), + ("Felladrin/Minueza-32M-Base", "mistral", 128, 1, 2, 64, 256, 32002, {"num_key_value_heads": 1}), + ("wtang06/mpt-125m-c4", "mpt", 128, 1, 2, 64, 256, 50368, {}), + ("hakurei/gpt-j-random-tinier", "gptj", 128, 1, 2, 64, 256, 50400, {"num_key_value_heads": 1, "rotary_dim": 16}), + ("mistralai/Mixtral-8x7B-Instruct-v0.1", "mixtral", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}), + ( + "meta-llama/Llama-3.2-1B", + "llama", + 128, + 1, + 2, + 64, + 256, + 128256, + { + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3", + }, + }, + ), + ( + "unsloth/gemma-2b", + "gemma", + 128, + 1, + 2, + 64, + 256, + 256000, + {"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2b"}, + ), + ( + "unsloth/gemma-2-2b", + "gemma2", + 128, + 1, + 2, + 64, + 256, + 256000, + {"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2-2b"}, + ), + # ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", "llama", 128, 1, 2, 64, 256, 32003, {"num_key_value_heads": 1, "architectures": ["LlamaForCausalLM"], "pad_token_id": 0}), + # ("TheBloke/Llama-2-7B-GPTQ", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 2}), + ( + 
"ibm-granite/granite-20b-code-base", + "gpt_bigcode", + 128, + 1, + 2, + 64, + 256, + 49152, + {"num_key_value_heads": 1, "activation_function": "gelu", "architectures": ["GPTBigCodeForCausalLM"]}, + ), + # ("neuralmagic/Llama-3.2-3B-Instruct-FP8", "llama", 128, 1, 2, 64, 256, 128256, {"num_key_value_heads": 2}), + # ("neuralmagic/Qwen2-0.5B-Instruct-FP8", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1,"quantization_config": {"activation_scheme": "static","ignored_layers": [ "lm_head" ],"quant_method": "fp8"}}), + # ("ibm-granite/granite-3.1-2b-instruct", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 2}), + ("ibm-granite/granite-guardian-3.1-2b", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 1}), +] + + +def get_model_configs_and_names(configs: List[tuple]): + configs = [ + ( + AutoConfig.for_model( + model_type, + max_position_embeddings=max_position_embeddings, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + vocab_size=vocab_size, + **additional_params, + ), + model_name, + ) + for ( + model_name, + model_type, + max_position_embeddings, + num_hidden_layers, + num_attention_heads, + hidden_size, + intermediate_size, + vocab_size, + additional_params, + ) in configs + ] + names = [y for (_, y) in configs] + return configs, names + + +test_dummy_model_configs, test_dummy_model_names = get_model_configs_and_names(test_dummy_model_configs) + test_models_qnn = [ "mistralai/Mixtral-8x7B-Instruct-v0.1", "meta-llama/Llama-3.2-1B", "unsloth/gemma-2b", "ibm-granite/granite-guardian-3.1-2b", ] +test_dummy_model_configs_qnn = [ + # model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params + ("mistralai/Mixtral-8x7B-Instruct-v0.1", "mixtral", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}), + ( + "meta-llama/Llama-3.2-1B", + "llama", + 128, + 1, + 2, + 64, + 256, + 128256, + { + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3", + }, + }, + ), + ( + "unsloth/gemma-2b", + "gemma", + 128, + 1, + 2, + 64, + 256, + 256000, + {"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2b"}, + ), + ("ibm-granite/granite-guardian-3.1-2b", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 1}), +] +test_dummy_model_configs_qnn, test_dummy_model_names_qnn = get_model_configs_and_names(test_dummy_model_configs_qnn) spd_test_models = [ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "Qwen/Qwen2-0.5B", ] +test_dummy_model_configs_spd = [ + # model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params + ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}), + ("Qwen/Qwen2-0.5B", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1}), +] +test_dummy_model_configs_spd, test_dummy_model_names_spd = get_model_configs_and_names(test_dummy_model_configs_spd) def load_causal_lm_model(model_config): @@ -101,6 +264,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( prefill_only: Optional[bool] = None, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + model_hf: Optional[nn.Module] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, the ONNX 
model, and the Cloud AI 100 model, both with and without continuous batching. @@ -113,8 +277,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( replace_transformers_quantizers() model_config = {"model_name": model_name} model_config["n_layer"] = n_layer - - model_hf, _ = load_causal_lm_model(model_config) + if model_hf is None: + model_hf, _ = load_causal_lm_model(model_config) + model_hf_cb = copy.deepcopy(model_hf) tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) config = model_hf.config @@ -129,16 +294,13 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ) pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) - is_tlm = False if num_speculative_tokens is None else True qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm, pretrained_model_name_or_path=model_name) - pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( "Tokens don't match for HF PyTorch model output and KV PyTorch model output" ) - onnx_model_path = qeff_model.export() ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm) gen_len = ort_tokens.shape[-1] @@ -147,7 +309,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -174,8 +335,10 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) if prefill_only is not None: return + # testing for CB models - model_hf, _ = load_causal_lm_model(model_config) + model_hf = model_hf_cb + model_hf.eval() full_batch_size = 4 fbs_prompts = Constants.INPUT_STR * 4 api_runner = ApiRunner( @@ -252,6 +415,28 @@ def test_causal_lm_export_with_deprecated_api(model_name): ) +@pytest.mark.on_qaic +@pytest.mark.regular +@pytest.mark.parametrize( + "test_dummy_model_config, test_dummy_model_name", test_dummy_model_configs, ids=test_dummy_model_names +) +def test_dummy_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_config, test_dummy_model_name): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + + torch.manual_seed(42) + model_hf = AutoModelForCausalLM.from_config( + test_dummy_model_config, + attn_implementation="eager", + ) + model_hf.eval() + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_name, model_hf=model_hf) + + +@pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", test_models_qaic) def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): @@ -268,6 +453,36 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) +@pytest.mark.on_qaic +@pytest.mark.regular +@pytest.mark.qnn +@pytest.mark.parametrize( + "test_dummy_model_config_qnn, test_dummy_model_name_qnn", + test_dummy_model_configs_qnn, + ids=test_dummy_model_names_qnn, +) +def test_dummy_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(test_dummy_model_config_qnn, test_dummy_model_name_qnn): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
+ ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + + torch.manual_seed(42) + model_hf = AutoModelForCausalLM.from_config( + test_dummy_model_config_qnn, + attn_implementation="eager", + ) + model_hf.eval() + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + test_dummy_model_name_qnn, enable_qnn=True, qnn_config=qnn_config_json_path, model_hf=model_hf + ) + + +@pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_qnn) @@ -292,6 +507,35 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): @pytest.mark.skip() # remove when the SDK 1.20.0 issue solved for compiling this model +@pytest.mark.regular +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.parametrize( + "test_dummy_model_config_spd, test_dummy_model_name_spd", + test_dummy_model_configs_spd, + ids=test_dummy_model_names_spd, +) +def test_dummy_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_config_spd, test_dummy_model_name_spd): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + + torch.manual_seed(42) + model_hf = AutoModelForCausalLM.from_config( + test_dummy_model_config_spd, + attn_implementation="eager", + ) + model_hf.eval() + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=test_dummy_model_name_spd, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS, model_hf=model_hf + ) + + +@pytest.mark.skip() # remove when the SDK 1.20.0 issue solved for compiling this model +@pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", spd_test_models) def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): @@ -316,7 +560,7 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching. """ - model_name = "gpt2" + model_name = "gpt2" # hf-internal-testing/tiny-random-gpt2 prompt_len = 1 check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
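
For reference, a minimal sketch (not part of the patch) of the dummy-model pattern the new test_dummy_* tests rely on: build a tiny single-layer config with AutoConfig.for_model, instantiate random weights with AutoModelForCausalLM.from_config, and hand the resulting module to the shared checker through the new model_hf argument. The values below mirror the gpt2 entry of test_dummy_model_configs; the final call is shown commented out because it assumes the QEfficient test environment and an available Cloud AI 100 device.

import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Tiny config: 1 hidden layer, 2 heads, 64-dim hidden size keeps export/compile fast.
config = AutoConfig.for_model(
    "gpt2",
    max_position_embeddings=128,
    num_hidden_layers=1,
    num_attention_heads=2,
    hidden_size=64,
    intermediate_size=256,
    vocab_size=50257,
)

torch.manual_seed(42)  # deterministic random weights across runs
model_hf = AutoModelForCausalLM.from_config(config, attn_implementation="eager")
model_hf.eval()

# Inside the test suite, this module is then passed to the shared checker:
# check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100("gpt2", model_hf=model_hf)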