diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index e6a69d5fb..f560904f1 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -58,7 +58,7 @@ pipeline { mkdir -p $PWD/Non_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_qaic && - pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml && + pytest tests -m '(not cli) and (on_qaic) and (not nightly) and (not multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml && junitparser merge tests/tests_log2.xml tests/tests_log.xml && deactivate" ''' @@ -144,7 +144,7 @@ pipeline { mkdir -p $PWD/Qnn_non_cli && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Qnn_non_cli && - pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml && + pytest tests -m '(not cli) and (qnn) and (not nightly) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml && junitparser merge tests/tests_log5.xml tests/tests_log.xml && deactivate" ''' diff --git a/tests/cloud/conftest.py b/tests/cloud/conftest.py index 8b17297ac..a130bbdbe 100644 --- a/tests/cloud/conftest.py +++ b/tests/cloud/conftest.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil @@ -145,165 +144,6 @@ def custom_io_file_path(self): return str(os.path.join(self.onnx_dir_path(), "custom_io_fp16.yaml")) -@pytest.fixture(scope="function") -def setup( - model_name, - num_cores, - prompt, - prompts_txt_file_path, - aic_enable_depth_first, - mos, - cache_dir, - hf_token, - batch_size, - prompt_len, - ctx_len, - mxfp6, - mxint8, - full_batch_size, - device_group, - enable_qnn, - qnn_config, -): - """ - It is a fixture or shared object of all testing script within or inner folder, - Args are coming from the dynamically generated tests method i.e, pytest_generate_tests via testing script or method - -------- - Args: same as set up initialization - Return: model_setup class object - """ - model_setup = ModelSetup( - model_name, - num_cores, - prompt, - prompts_txt_file_path, - bool(aic_enable_depth_first), - mos, - cache_dir, - hf_token, - batch_size, - prompt_len, - ctx_len, - bool(mxfp6), - bool(mxint8), - full_batch_size, - device_group, - enable_qnn, - qnn_config, - ) - - yield model_setup - del model_setup - - -def pytest_generate_tests(metafunc): - """ - pytest_generate_tests hook is used to create our own input parametrization, - It generates all the test cases of different combination of input parameters which are read from the json file, - and passed to each testing script module. 
- ----------- - Ref: https://docs.pytest.org/en/7.3.x/how-to/parametrize.html - """ - json_file = "tests/cloud/high_level_testing.json" - with open(json_file, "r") as file: - json_data = json.load(file) - - metafunc.parametrize("model_name", json_data["model_name"], ids=lambda x: "model_name=" + str(x)) - metafunc.parametrize("num_cores", json_data["num_cores"], ids=lambda x: "num_cores=" + str(x)) - metafunc.parametrize("prompt", json_data["prompt"], ids=lambda x: "prompt=" + str(x)) - metafunc.parametrize( - "prompts_txt_file_path", json_data["prompts_txt_file_path"], ids=lambda x: "prompts_txt_file_path=" + str(x) - ) - metafunc.parametrize( - "aic_enable_depth_first", json_data["aic_enable_depth_first"], ids=lambda x: "aic_enable_depth_first=" + str(x) - ) - metafunc.parametrize("mos", json_data["mos"], ids=lambda x: "mos=" + str(x)) - metafunc.parametrize("cache_dir", [None], ids=lambda x: "cache_dir=" + str(x)) - metafunc.parametrize("hf_token", json_data["hf_token"], ids=lambda x: "hf_token=" + str(x)) - metafunc.parametrize("batch_size", json_data["batch_size"], ids=lambda x: "batch_size=" + str(x)) - metafunc.parametrize("prompt_len", json_data["prompt_len"], ids=lambda x: "prompt_len=" + str(x)) - metafunc.parametrize("ctx_len", json_data["ctx_len"], ids=lambda x: "ctx_len=" + str(x)) - metafunc.parametrize("mxfp6", json_data["mxfp6"], ids=lambda x: "mxfp6=" + str(x)) - metafunc.parametrize("mxint8", json_data["mxint8"], ids=lambda x: "mxint8=" + str(x)) - metafunc.parametrize("full_batch_size", json_data["full_batch_size"], ids=lambda x: "full_batch_size=" + str(x)) - metafunc.parametrize("device_group", json_data["device_group"], ids=lambda x: "device_group=" + str(x)) - metafunc.parametrize("enable_qnn", json_data["enable_qnn"], ids=lambda x: "enable_qnn=" + str(x)) - metafunc.parametrize("qnn_config", json_data["qnn_config"], ids=lambda x: "qnn_config=" + str(x)) - - -def pytest_collection_modifyitems(config, items): - """ - pytest_collection_modifyitems is pytest a hook, - which is used to re-order the execution order of the testing script/methods - with various combination of inputs. - called after collection has been performed, may filter or re-order the items in-place. 
- Parameters: - items (List[_pytest.nodes.Item]) list of item objects - ---------- - Ref: https://docs.pytest.org/en/4.6.x/reference.html#collection-hooks - """ - run_first = ["test_export", "test_infer"] - modules_name = {item.module.__name__ for item in items} - cloud_modules = [] - non_cloud_modules = [] - for module in modules_name: - if module in run_first: - cloud_modules.append(module) - else: - non_cloud_modules.append(module) - - if len(cloud_modules) > 1: - modules = {item: item.module.__name__ for item in items} - items[:] = sorted(items, key=lambda x: run_first.index(modules[x]) if modules[x] in run_first else len(items)) - - non_cloud_tests = [] - - for itm in items: - if modules[itm] not in cloud_modules: - non_cloud_tests.append(itm) - - num_cloud_tests = len(items) - len(non_cloud_tests) - num_cloud_test_cases = num_cloud_tests // len(cloud_modules) - final_items = [] - - for i in range(num_cloud_test_cases): - for j in range(len(cloud_modules)): - final_items.append(items[i + j * num_cloud_test_cases]) - - final_items.extend(non_cloud_tests) - items[:] = final_items - - if config.getoption("--all"): - return - - first_model = items[0].callspec.params["model_name"] if hasattr(items[0], "callspec") else None - - for item in items: - if item.module.__name__ in ["test_export", "test_compile_and_execute", "test_infer"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if not params["enable_qnn"] and params["qnn_config"] is not None: - item.add_marker( - pytest.mark.skip(reason="Skipping because same as enable_qnn = false and qnn_config = None") - ) - if params["enable_qnn"]: - item.add_marker(pytest.mark.qnn) - - if item.module.__name__ in ["test_export", "test_compile_and_execute"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if params["model_name"] != first_model: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - if params["prompt_len"] == 2: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - - if item.module.__name__ in ["test_infer"]: - if hasattr(item, "callspec"): - params = item.callspec.params - if params["prompt_len"] == 2 and params["model_name"] != first_model: - item.add_marker(pytest.mark.skip(reason="Skipping because not needed now...")) - - def qeff_models_clean_up(): if os.path.exists(QEFF_MODELS_DIR): shutil.rmtree(QEFF_MODELS_DIR) diff --git a/tests/cloud/high_level_testing.json b/tests/cloud/high_level_testing.json deleted file mode 100644 index d30382dc6..000000000 --- a/tests/cloud/high_level_testing.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "license": "SEE LICENSE IN LICENSE FILE", - "model_name" : ["gpt2"], - "num_cores" : [16], - "prompt" : ["My name is"], - "prompts_txt_file_path" : ["examples/prompts.txt"], - "aic_enable_depth_first" : [1], - "mos" : [1], - "cache_dir" : [null], - "hf_token" : [null], - "batch_size" : [1], - "prompt_len" : [32], - "ctx_len" : [128], - "mxfp6" : [1], - "mxint8" : [1], - "device_group" : [null], - "full_batch_size" : [null,3], - "enable_qnn" : [false, true], - "qnn_config" : [null, "QEfficient/compile/qnn_config.json"] -} diff --git a/tests/cloud/test_compile_and_execute.py b/tests/cloud/test_compile_and_execute.py deleted file mode 100644 index 341d63bb7..000000000 --- a/tests/cloud/test_compile_and_execute.py +++ /dev/null @@ -1,80 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import os - -import pytest -import yaml - -import QEfficient -from QEfficient.cloud.execute import main as execute -from QEfficient.cloud.export import get_onnx_model_path - - -@pytest.mark.on_qaic -@pytest.mark.cli -def test_compile(setup, mocker): - """ - test_compile is a HL compile api testing function, - checks compile api code flow, object creations, internal api calls, internal returns. - --------- - Parameters: - setup: is a fixture defined in conftest.py module. - mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. - """ - ms = setup - onnx_model_path = get_onnx_model_path( - model_name=ms.model_name, - cache_dir=ms.cache_dir, - hf_token=ms.hf_token, - full_batch_size=ms.full_batch_size, - local_model_dir=ms.local_model_dir, - ) - - base_key = "past_key." - base_value = "past_value." - precision = "float16" - - data = [] - - for i in range(12): - data.append({"IOName": f"{base_key}{i}", "Precision": precision}) - data.append({"IOName": f"{base_value}{i}", "Precision": precision}) - - for i in range(12): - data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) - data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) - - with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file: - yaml.dump(data, file) - - qpc_path = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname(ms.qpc_dir_path()), - num_cores=ms.num_cores, - device_group=ms.device_group, - custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml", - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - batch_size=ms.batch_size, - prompt_len=ms.prompt_len, - ctx_len=ms.ctx_len, - mxfp6=ms.mxfp6, - mxint8=ms.mxint8, - full_batch_size=ms.full_batch_size, - enable_qnn=ms.enable_qnn, - ) - - execute( - model_name=ms.model_name, - qpc_path=qpc_path, - prompt=ms.prompt, - prompts_txt_file_path=ms.prompts_txt_file_path, - generation_len=ms.generation_len, - hf_token=ms.hf_token, - full_batch_size=ms.full_batch_size, - ) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py deleted file mode 100644 index df5b12f5e..000000000 --- a/tests/cloud/test_export.py +++ /dev/null @@ -1,31 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - - -import pytest - -from QEfficient.cloud.export import main as export - - -@pytest.mark.cli -def test_export(setup, mocker): - """ - test_export is a HL export api testing function, - checks export api code flow, object creations, internal api calls, internal returns. - --------- - Parameters: - setup: is a fixture defined in conftest.py module. - mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
- """ - ms = setup - - export( - model_name=ms.model_name, - hf_token=ms.hf_token, - local_model_dir=ms.local_model_dir, - full_batch_size=ms.full_batch_size, - ) diff --git a/tests/cloud/test_export_compile_execute.py b/tests/cloud/test_export_compile_execute.py new file mode 100644 index 000000000..112b2cd96 --- /dev/null +++ b/tests/cloud/test_export_compile_execute.py @@ -0,0 +1,169 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +import os + +import pytest +import yaml +from conftest import ModelSetup + +import QEfficient +from QEfficient.cloud.execute import main as execute +from QEfficient.cloud.export import main as export + +configs = [ + { + "model_name": "gpt2", + "num_cores": 16, + "prompt": "My name is", + "prompts_txt_file_path": "examples/prompts.txt", + "aic_enable_depth_first": 1, + "mos": 1, + "cache_dir": None, + "hf_token": None, + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 128, + "mxfp6": 1, + "mxint8": 1, + "device_group": None, + "full_batch_size": 3, + "enable_qnn": True, + "qnn_config": "QEfficient/compile/qnn_config.json", + "image_url": "https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", + } +] + + +def check_export_compile_execute( + mocker, + **kwargs, +): + # Setup model + model_setup = ModelSetup( + kwargs["model_name"], + kwargs["num_cores"], + kwargs["prompt"], + kwargs["prompts_txt_file_path"], + bool(kwargs["aic_enable_depth_first"]), + kwargs["mos"], + kwargs["cache_dir"], + kwargs["hf_token"], + kwargs["batch_size"], + kwargs["prompt_len"], + kwargs["ctx_len"], + bool(kwargs["mxfp6"]), + bool(kwargs["mxint8"]), + kwargs["full_batch_size"], + kwargs["device_group"], + kwargs["enable_qnn"], + kwargs["qnn_config"], + ) + + # Spy on internal functions + mocker.spy(QEfficient.utils, "check_and_assign_cache_dir") + mock_get_onnx = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") + + # Export model + export( + model_name=model_setup.model_name, + hf_token=model_setup.hf_token, + local_model_dir=model_setup.local_model_dir, + full_batch_size=model_setup.full_batch_size, + ) + + onnx_model_path = mock_get_onnx.spy_return + print(f"Captured ONNX path: {onnx_model_path}") + + base_key = "past_key." + base_value = "past_value." 
+ precision = "float16" + + data = [] + + for i in range(12): + data.append({"IOName": f"{base_key}{i}", "Precision": precision}) + data.append({"IOName": f"{base_value}{i}", "Precision": precision}) + + for i in range(12): + data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision}) + data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision}) + + with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file: + yaml.dump(data, file) + + # Compile model + qpc_path = QEfficient.compile( + onnx_path=onnx_model_path, + qpc_path=os.path.dirname(model_setup.qpc_dir_path()), + num_cores=model_setup.num_cores, + device_group=model_setup.device_group, + custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml", + aic_enable_depth_first=model_setup.aic_enable_depth_first, + mos=model_setup.mos, + batch_size=model_setup.batch_size, + prompt_len=model_setup.prompt_len, + ctx_len=model_setup.ctx_len, + mxfp6=model_setup.mxfp6, + mxint8=model_setup.mxint8, + full_batch_size=model_setup.full_batch_size, + enable_qnn=model_setup.enable_qnn, + ) + + # Execute model + execute( + model_name=model_setup.model_name, + qpc_path=qpc_path, + prompt=model_setup.prompt, + prompts_txt_file_path=model_setup.prompts_txt_file_path, + generation_len=model_setup.generation_len, + hf_token=model_setup.hf_token, + full_batch_size=model_setup.full_batch_size, + ) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.parametrize("config", configs) +def test_export_compile_execute(mocker, config): + # testing export -> compile -> infer without full_batch_size + + local_config = config.copy() + local_config.update(full_batch_size=None, enable_qnn=False, qnn_config=None) + check_export_compile_execute(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.parametrize("config", configs) +def test_export_compile_execute_fb(mocker, config): + # testing export -> compile -> infer with full_batch_size + local_config = config.copy() + local_config.update(enable_qnn=False, qnn_config=None) + check_export_compile_execute(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.cli +@pytest.mark.parametrize("config", configs) +def test_export_compile_execute_qnn(mocker, config): + # testing export -> compile -> infer without full_batch_size in QNN enviroment + local_config = config.copy() + local_config.update(full_batch_size=None, enable_qnn=False, qnn_config=None) + check_export_compile_execute(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.cli +@pytest.mark.parametrize("config", configs) +def test_export_compile_execute_qnn_fb(mocker, config): + # testing export -> compile -> infer with full_batch_size in QNN enviroment + local_config = config.copy() + check_export_compile_execute(mocker=mocker, **local_config) diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 396d9609d..6d183a5d9 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,28 +5,56 @@ # # ----------------------------------------------------------------------------- - import pytest +from conftest import ModelSetup from QEfficient.cloud.infer import main as infer +configs = [ + { + "model_name": "gpt2", + "num_cores": 16, + "prompt": "My name is", + "prompts_txt_file_path": "examples/prompts.txt", + "aic_enable_depth_first": 1, + "mos": 1, + "cache_dir": None, + "hf_token": None, + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 128, + "mxfp6": 1, + "mxint8": 
1, + "device_group": None, + "full_batch_size": 3, + "enable_qnn": True, + "qnn_config": "QEfficient/compile/qnn_config.json", + "image_url": "https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", + } +] + + +def check_infer(mocker, generation_len=32, **kwargs): + ms = ModelSetup( + kwargs["model_name"], + kwargs["num_cores"], + kwargs["prompt"], + kwargs["prompts_txt_file_path"], + bool(kwargs["aic_enable_depth_first"]), + kwargs["mos"], + kwargs["cache_dir"], + kwargs["hf_token"], + kwargs["batch_size"], + kwargs["prompt_len"], + kwargs["ctx_len"], + bool(kwargs["mxfp6"]), + bool(kwargs["mxint8"]), + kwargs["full_batch_size"], + kwargs["device_group"], + kwargs["enable_qnn"], + kwargs["qnn_config"], + ) -@pytest.mark.on_qaic -@pytest.mark.cli -@pytest.mark.usefixtures("clean_up_after_test") -def test_infer(setup, mocker): - """ - test_infer is a HL infer api testing function, - checks infer api code flow, object creations, internal api calls, internal returns. - --------- - Parameters: - setup: is a fixture defined in conftest.py module. - mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. - --------- - Ref: https://docs.pytest.org/en/7.1.x/how-to/fixtures.html - Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html - """ - ms = setup infer( model_name=ms.model_name, num_cores=ms.num_cores, @@ -39,9 +67,91 @@ def test_infer(setup, mocker): batch_size=ms.batch_size, prompt_len=ms.prompt_len, ctx_len=ms.ctx_len, - generation_len=ms.generation_len, + generation_len=generation_len, mxfp6=ms.mxfp6, mxint8=ms.mxint8, full_batch_size=ms.full_batch_size, enable_qnn=ms.enable_qnn, + qnn_config=ms.qnn_config, + image_url=kwargs["image_url"], + ) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer(mocker, config): + """ + test_infer is a HL infer api testing function, + checks infer api code flow, object creations, internal api calls, internal returns. + --------- + Parameters: + setup: is a fixture defined in conftest.py module. + mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
+ --------- + Ref: https://docs.pytest.org/en/7.1.x/how-to/fixtures.html + Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html + """ + # testing infer without full_batch_size + local_config = config.copy() + local_config.update(full_batch_size=None, enable_qnn=False, qnn_config=None) + check_infer(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer_fb(mocker, config): + # testing infer with full_batch_size + local_config = config.copy() + local_config.update(enable_qnn=False, qnn_config=None) + check_infer(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.qnn +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer_qnn(mocker, config): + # testing infer without full_batch_size in QNN enviroment + local_config = config.copy() + local_config.update( + full_batch_size=None, + ) + check_infer(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.qnn +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer_qnn_fb(mocker, config): + # testing infer with full_batch_size in QNN enviroment + local_config = config.copy() + check_infer(mocker=mocker, **local_config) + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.multimodal +@pytest.mark.usefixtures("clean_up_after_test") +@pytest.mark.parametrize("config", configs) +def test_infer_vlm(mocker, config): + # testing infer for MM models + local_config = config.copy() + local_config.update( + { + "model_name": "llava-hf/llava-1.5-7b-hf", + "prompt": "Describe the image.", + "prompt_len": 1024, + "ctx_len": 2048, + "full_batch_size": None, + "enable_qnn": False, + "qnn_config": None, + } ) + check_infer(mocker=mocker, generation_len=20, **local_config) diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py deleted file mode 100644 index 94adb3f36..000000000 --- a/tests/cloud/test_infer_vlm.py +++ /dev/null @@ -1,41 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import pytest - -from QEfficient.cloud.infer import main as infer - - -@pytest.mark.on_qaic -@pytest.mark.cli -@pytest.mark.multimodal -@pytest.mark.usefixtures("clean_up_after_test") -def test_vlm_cli(setup, mocker): - ms = setup - # Taking some values from setup fixture and assigning other's based on model's requirement. - # For example, mxint8 is not required for VLM models, so assigning False. 
- infer( - model_name="llava-hf/llava-1.5-7b-hf", - num_cores=ms.num_cores, - prompt="Describe the image.", - prompts_txt_file_path=None, - aic_enable_depth_first=ms.aic_enable_depth_first, - mos=ms.mos, - batch_size=1, - full_batch_size=None, - prompt_len=1024, - ctx_len=2048, - generation_len=20, - mxfp6=False, - mxint8=False, - local_model_dir=None, - cache_dir=None, - hf_token=ms.hf_token, - enable_qnn=False, - qnn_config=None, - image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", - ) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 7099ab604..98cba4143 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -5,13 +5,15 @@ # # ----------------------------------------------------------------------------- +import copy import os -from typing import Optional +from typing import List, Optional import numpy as np import pytest import torch -from transformers import AutoModelForCausalLM +import torch.nn as nn +from transformers import AutoConfig, AutoModelForCausalLM from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM @@ -49,17 +51,178 @@ "hpcai-tech/grok-1", ] +test_dummy_model_configs = [ + # model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params + ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}), + ("gpt2", "gpt2", 128, 1, 2, 64, 256, 50257, {"num_key_value_heads": 1}), + ( + "Salesforce/codegen-350M-mono", + "codegen", + 128, + 1, + 4, + 64, + 256, + 51200, + {"num_key_value_heads": 1, "rotary_dim": 16}, + ), + # ("microsoft/Phi-3-mini-4k-instruct","phi3", 128, 1, 2, 64, 256, 32064, {}), ouput not matching + ("tiiuae/falcon-7b", "falcon", 128, 1, 2, 64, 256, 65024, {"num_key_value_heads": 1}), + ("Qwen/Qwen2-0.5B", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1}), + ("bigcode/starcoder2-3b", "starcoder2", 128, 1, 2, 64, 256, 49152, {"num_key_value_heads": 1}), + ("Felladrin/Minueza-32M-Base", "mistral", 128, 1, 2, 64, 256, 32002, {"num_key_value_heads": 1}), + ("wtang06/mpt-125m-c4", "mpt", 128, 1, 2, 64, 256, 50368, {}), + ("hakurei/gpt-j-random-tinier", "gptj", 128, 1, 2, 64, 256, 50400, {"num_key_value_heads": 1, "rotary_dim": 16}), + ("mistralai/Mixtral-8x7B-Instruct-v0.1", "mixtral", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}), + ( + "meta-llama/Llama-3.2-1B", + "llama", + 128, + 1, + 2, + 64, + 256, + 128256, + { + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3", + }, + }, + ), + ( + "unsloth/gemma-2b", + "gemma", + 128, + 1, + 2, + 64, + 256, + 256000, + {"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2b"}, + ), + ( + "unsloth/gemma-2-2b", + "gemma2", + 128, + 1, + 2, + 64, + 256, + 256000, + {"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2-2b"}, + ), + # ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", "llama", 128, 1, 2, 64, 256, 32003, {"num_key_value_heads": 1, "architectures": ["LlamaForCausalLM"], "pad_token_id": 0}), + # ("TheBloke/Llama-2-7B-GPTQ", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 2}), + ( + 
"ibm-granite/granite-20b-code-base", + "gpt_bigcode", + 128, + 1, + 2, + 64, + 256, + 49152, + {"num_key_value_heads": 1, "activation_function": "gelu", "architectures": ["GPTBigCodeForCausalLM"]}, + ), + # ("neuralmagic/Llama-3.2-3B-Instruct-FP8", "llama", 128, 1, 2, 64, 256, 128256, {"num_key_value_heads": 2}), + # ("neuralmagic/Qwen2-0.5B-Instruct-FP8", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1,"quantization_config": {"activation_scheme": "static","ignored_layers": [ "lm_head" ],"quant_method": "fp8"}}), + # ("ibm-granite/granite-3.1-2b-instruct", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 2}), + ("ibm-granite/granite-guardian-3.1-2b", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 1}), +] + + +def get_model_configs_and_names(configs: List[tuple]): + configs = [ + ( + AutoConfig.for_model( + model_type, + max_position_embeddings=max_position_embeddings, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + vocab_size=vocab_size, + **additional_params, + ), + model_name, + ) + for ( + model_name, + model_type, + max_position_embeddings, + num_hidden_layers, + num_attention_heads, + hidden_size, + intermediate_size, + vocab_size, + additional_params, + ) in configs + ] + names = [y for (_, y) in configs] + return configs, names + + +test_dummy_model_configs, test_dummy_model_names = get_model_configs_and_names(test_dummy_model_configs) + test_models_qnn = [ "mistralai/Mixtral-8x7B-Instruct-v0.1", "meta-llama/Llama-3.2-1B", "unsloth/gemma-2b", "ibm-granite/granite-guardian-3.1-2b", ] +test_dummy_model_configs_qnn = [ + # model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params + ("mistralai/Mixtral-8x7B-Instruct-v0.1", "mixtral", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}), + ( + "meta-llama/Llama-3.2-1B", + "llama", + 128, + 1, + 2, + 64, + 256, + 128256, + { + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3", + }, + }, + ), + ( + "unsloth/gemma-2b", + "gemma", + 128, + 1, + 2, + 64, + 256, + 256000, + {"num_key_value_heads": 1, "_name_or_path": "unsloth/gemma-2b"}, + ), + ("ibm-granite/granite-guardian-3.1-2b", "granite", 128, 1, 2, 64, 256, 49155, {"num_key_value_heads": 1}), +] +test_dummy_model_configs_qnn, test_dummy_model_names_qnn = get_model_configs_and_names(test_dummy_model_configs_qnn) spd_test_models = [ "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "Qwen/Qwen2-0.5B", ] +test_dummy_model_configs_spd = [ + # model_name, model_type, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params + ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "llama", 128, 1, 2, 64, 256, 32000, {"num_key_value_heads": 1}), + ("Qwen/Qwen2-0.5B", "qwen2", 128, 1, 2, 64, 256, 151936, {"num_key_value_heads": 1}), +] +test_dummy_model_configs_spd, test_dummy_model_names_spd = get_model_configs_and_names(test_dummy_model_configs_spd) def load_causal_lm_model(model_config): @@ -101,6 +264,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( prefill_only: Optional[bool] = None, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + model_hf: Optional[nn.Module] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, the ONNX 
model, and the Cloud AI 100 model, both with and without continuous batching. @@ -113,8 +277,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( replace_transformers_quantizers() model_config = {"model_name": model_name} model_config["n_layer"] = n_layer - - model_hf, _ = load_causal_lm_model(model_config) + if model_hf is None: + model_hf, _ = load_causal_lm_model(model_config) + model_hf_cb = copy.deepcopy(model_hf) tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) config = model_hf.config @@ -129,16 +294,13 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ) pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) - is_tlm = False if num_speculative_tokens is None else True qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm, pretrained_model_name_or_path=model_name) - pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( "Tokens don't match for HF PyTorch model output and KV PyTorch model output" ) - onnx_model_path = qeff_model.export() ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm) gen_len = ort_tokens.shape[-1] @@ -147,7 +309,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -174,8 +335,10 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) if prefill_only is not None: return + # testing for CB models - model_hf, _ = load_causal_lm_model(model_config) + model_hf = model_hf_cb + model_hf.eval() full_batch_size = 4 fbs_prompts = Constants.INPUT_STR * 4 api_runner = ApiRunner( @@ -252,6 +415,28 @@ def test_causal_lm_export_with_deprecated_api(model_name): ) +@pytest.mark.on_qaic +@pytest.mark.regular +@pytest.mark.parametrize( + "test_dummy_model_config, test_dummy_model_name", test_dummy_model_configs, ids=test_dummy_model_names +) +def test_dummy_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_config, test_dummy_model_name): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + + torch.manual_seed(42) + model_hf = AutoModelForCausalLM.from_config( + test_dummy_model_config, + attn_implementation="eager", + ) + model_hf.eval() + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_name, model_hf=model_hf) + + +@pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", test_models_qaic) def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): @@ -268,6 +453,36 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) +@pytest.mark.on_qaic +@pytest.mark.regular +@pytest.mark.qnn +@pytest.mark.parametrize( + "test_dummy_model_config_qnn, test_dummy_model_name_qnn", + test_dummy_model_configs_qnn, + ids=test_dummy_model_names_qnn, +) +def test_dummy_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(test_dummy_model_config_qnn, test_dummy_model_name_qnn): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. 
+ ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + + torch.manual_seed(42) + model_hf = AutoModelForCausalLM.from_config( + test_dummy_model_config_qnn, + attn_implementation="eager", + ) + model_hf.eval() + qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") + create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + test_dummy_model_name_qnn, enable_qnn=True, qnn_config=qnn_config_json_path, model_hf=model_hf + ) + + +@pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_qnn) @@ -292,6 +507,35 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): @pytest.mark.skip() # remove when the SDK 1.20.0 issue solved for compiling this model +@pytest.mark.regular +@pytest.mark.on_qaic +@pytest.mark.qnn +@pytest.mark.parametrize( + "test_dummy_model_config_spd, test_dummy_model_name_spd", + test_dummy_model_configs_spd, + ids=test_dummy_model_names_spd, +) +def test_dummy_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(test_dummy_model_config_spd, test_dummy_model_name_spd): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + + torch.manual_seed(42) + model_hf = AutoModelForCausalLM.from_config( + test_dummy_model_config_spd, + attn_implementation="eager", + ) + model_hf.eval() + + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=test_dummy_model_name_spd, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS, model_hf=model_hf + ) + + +@pytest.mark.skip() # remove when the SDK 1.20.0 issue solved for compiling this model +@pytest.mark.nightly @pytest.mark.on_qaic @pytest.mark.parametrize("model_name", spd_test_models) def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): @@ -316,7 +560,7 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1(): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching. """ - model_name = "gpt2" + model_name = "gpt2" # hf-internal-testing/tiny-random-gpt2 prompt_len = 1 check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
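
For reference, a minimal sketch (not part of the patch) of the dummy-model pattern the new test_dummy_* tests rely on: build a tiny single-layer config with AutoConfig.for_model, instantiate random weights with AutoModelForCausalLM.from_config, and hand the resulting module to the shared checker through the new model_hf argument. The values below mirror the gpt2 entry of test_dummy_model_configs; the final call is shown commented out because it assumes the QEfficient test environment and an available Cloud AI 100 device.

import torch
from transformers import AutoConfig, AutoModelForCausalLM

# Tiny config: 1 hidden layer, 2 heads, 64-dim hidden size keeps export/compile fast.
config = AutoConfig.for_model(
    "gpt2",
    max_position_embeddings=128,
    num_hidden_layers=1,
    num_attention_heads=2,
    hidden_size=64,
    intermediate_size=256,
    vocab_size=50257,
)

torch.manual_seed(42)  # deterministic random weights across runs
model_hf = AutoModelForCausalLM.from_config(config, attn_implementation="eager")
model_hf.eval()

# Inside the test suite, this module is then passed to the shared checker:
# check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100("gpt2", model_hf=model_hf)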