diff --git a/SYCL/DeviceLib/built-ins/ext_native_math.cpp b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
index 9cc5096b38..4ab4dcf4d1 100644
--- a/SYCL/DeviceLib/built-ins/ext_native_math.cpp
+++ b/SYCL/DeviceLib/built-ins/ext_native_math.cpp
@@ -3,88 +3,36 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-// OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
-// test is compiled with the -fsycl-device-code-split flag
+// Tests oneapi extension native tanh math function for sycl::vec and
+// sycl::marray float cases.
 
-#include <cassert>
-#include <sycl/sycl.hpp>
-
-template <typename T> void assert_out_of_bound(T val, T lower, T upper) {
-  assert(sycl::all(lower < val && val < upper));
-}
-
-template <>
-void assert_out_of_bound<float>(float val, float lower, float upper) {
-  assert(lower < val && val < upper);
-}
-
-template <>
-void assert_out_of_bound<sycl::half>(sycl::half val, sycl::half lower,
-                                     sycl::half upper) {
-  assert(lower < val && val < upper);
-}
-
-template <typename T>
-void native_tanh_tester(sycl::queue q, T val, T up, T lo) {
-  T r = val;
-
-#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
-  {
-    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
-    q.submit([&](sycl::handler &cgh) {
-      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
-      cgh.single_task([=]() {
-        AccR[0] = sycl::ext::oneapi::experimental::native::tanh(AccR[0]);
-      });
-    });
-  }
-
-  assert_out_of_bound(r, up, lo);
-#else
-  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
-#endif
-}
-
-template <typename T>
-void native_exp2_tester(sycl::queue q, T val, T up, T lo) {
-  T r = val;
-
-#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
-  {
-    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
-    q.submit([&](sycl::handler &cgh) {
-      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
-      cgh.single_task([=]() {
-        AccR[0] = sycl::ext::oneapi::experimental::native::exp2(AccR[0]);
-      });
-    });
-  }
-
-  assert_out_of_bound(r, up, lo);
-#else
-  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
-#endif
-}
+#include "ext_native_math_common.hpp"
 
 int main() {
 
   sycl::queue q;
 
-  const double tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
-                         -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
-  const double tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
-                         -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
-  const double tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
-                         -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
+  const float tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                        -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+  const float tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95, 0.89, 0.75,  -0.1,
+                        -0.94, 0.92,  -0.84, 0.82, -1.0, 0.98, -1.10, 0.98};
+  const float tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91, 0.77,  0.1,
+                        -0.92, 0.94,  -0.82, 0.84, -0.98, 1.00, -0.98, 1.10};
 
   native_tanh_tester<float>(q, tv[0], tl[0], tu[0]);
   native_tanh_tester<sycl::float2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
                                    {tu[0], tu[1]});
   native_tanh_tester<sycl::float3>(
       q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+
   native_tanh_tester<sycl::float4>(q, {tv[0], tv[1], tv[2], tv[3]},
                                    {tl[0], tl[1], tl[2], tl[3]},
                                    {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::marray<float, 3>>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::marray<float, 4>>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                             {tl[0], tl[1], tl[2], tl[3]},
+                                             {tu[0], tu[1], tu[2], tu[3]});
   native_tanh_tester<sycl::float8>(
       q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
       {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
@@ -98,57 +46,5 @@ int main() {
       {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
        tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
 
-  if (q.get_device().has(sycl::aspect::fp16)) {
-
-    native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
-    native_tanh_tester<sycl::half2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
-                                    {tu[0], tu[1]});
-    native_tanh_tester<sycl::half3>(
-        q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
-    native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
-                                    {tl[0], tl[1], tl[2], tl[3]},
-                                    {tu[0], tu[1], tu[2], tu[3]});
-    native_tanh_tester<sycl::half8>(
-        q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
-        {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
-        {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
-    native_tanh_tester<sycl::half16>(
-        q,
-        {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
-         tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
-        {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
-         tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
-        {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
-         tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
-
-    const double ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
-                           -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
-    const double el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
-                           0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
-    const double eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
-                           0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
-
-    native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);
-    native_exp2_tester<sycl::half2>(q, {ev[0], ev[1]}, {el[0], el[1]},
-                                    {eu[0], eu[1]});
-    native_exp2_tester<sycl::half3>(
-        q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
-    native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
-                                    {el[0], el[1], el[2], el[3]},
-                                    {eu[0], eu[1], eu[2], eu[3]});
-    native_exp2_tester<sycl::half8>(
-        q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
-        {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},
-        {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7]});
-    native_exp2_tester<sycl::half16>(
-        q,
-        {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7], ev[8], ev[9],
-         ev[10], ev[11], ev[12], ev[13], ev[14], ev[15]},
-        {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7], el[8], el[9],
-         el[10], el[11], el[12], el[13], el[14], el[15]},
-        {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7], eu[8], eu[9],
-         eu[10], eu[11], eu[12], eu[13], eu[14], eu[15]});
-  }
-
   return 0;
 }
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp b/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp
new file mode 100644
index 0000000000..7511f03206
--- /dev/null
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_common.hpp
@@ -0,0 +1,67 @@
+#include <cassert>
+#include <sycl/sycl.hpp>
+
+template <typename T, size_t N>
+void assert_out_of_bound(sycl::marray<T, N> val, sycl::marray<T, N> lower,
+                         sycl::marray<T, N> upper) {
+  for (int i = 0; i < N; i++) {
+    assert(lower[i] < val[i] && val[i] < upper[i]);
+  }
+}
+
+template <typename T> void assert_out_of_bound(T val, T lower, T upper) {
+  assert(sycl::all(lower < val && val < upper));
+}
+
+template <>
+void assert_out_of_bound<float>(float val, float lower, float upper) {
+  assert(lower < val && val < upper);
+}
+
+template <>
+void assert_out_of_bound<sycl::half>(sycl::half val, sycl::half lower,
+                                     sycl::half upper) {
+  assert(lower < val && val < upper);
+}
+
+template <typename T>
+void native_tanh_tester(sycl::queue q, T val, T up, T lo) {
+  T r = val;
+
+#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
+  {
+    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
+    q.submit([&](sycl::handler &cgh) {
+      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
+      cgh.single_task([=]() {
+        AccR[0] = sycl::ext::oneapi::experimental::native::tanh(AccR[0]);
+      });
+    });
+  }
+
+  assert_out_of_bound(r, up, lo);
+#else
+  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
+#endif
+}
+
+template <typename T>
+void native_exp2_tester(sycl::queue q, T val, T up, T lo) {
+  T r = val;
+
+#ifdef SYCL_EXT_ONEAPI_NATIVE_MATH
+  {
+    sycl::buffer<T, 1> BufR(&r, sycl::range<1>(1));
+    q.submit([&](sycl::handler &cgh) {
+      auto AccR = BufR.template get_access<sycl::access::mode::read_write>(cgh);
+      cgh.single_task([=]() {
+        AccR[0] = sycl::ext::oneapi::experimental::native::exp2(AccR[0]);
+      });
+    });
+  }
+
+  assert_out_of_bound(r, up, lo);
+#else
+  assert(!"SYCL_EXT_ONEAPI_NATIVE_MATH not supported");
+#endif
+}
diff --git a/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
new file mode 100644
index 0000000000..74bd0f8213
--- /dev/null
+++ b/SYCL/DeviceLib/built-ins/ext_native_math_fp16.cpp
@@ -0,0 +1,93 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// OpenCL CPU driver does not support cl_khr_fp16 extension for this reason this
+// test is compiled with the -fsycl-device-code-split flag
+
+// Tests oneapi extension native math functions for sycl::vec and sycl::marray
+// fp16 cases.
+
+#include "ext_native_math_common.hpp"
+
+int main() {
+
+  sycl::queue q;
+
+  if (!q.get_device().has(sycl::aspect::fp16)) {
+    std::cout << "skipping fp16 tests: requires fp16 device aspect."
+              << std::endl;
+    return 0;
+  }
+
+  const sycl::half tv[16] = {-2.0, -1.5, -1.0, 0.0, 2.0,  1.5, 1.0,   0.0,
+                             -1.7, 1.7,  -1.2, 1.2, -3.0, 3.0, -10.0, 10.0};
+  const sycl::half tl[16] = {-0.97, -0.91, -0.77, -0.1, 0.95,  0.89,
+                             0.75,  -0.1,  -0.94, 0.92, -0.84, 0.82,
+                             -1.0,  0.98,  -1.10, 0.98};
+  const sycl::half tu[16] = {-0.95, -0.89, -0.75, 0.1,  0.97,  0.91,
+                             0.77,  0.1,   -0.92, 0.94, -0.82, 0.84,
+                             -0.98, 1.00,  -0.98, 1.10};
+
+  native_tanh_tester<sycl::half>(q, tv[0], tl[0], tu[0]);
+  native_tanh_tester<sycl::half2>(q, {tv[0], tv[1]}, {tl[0], tl[1]},
+                                  {tu[0], tu[1]});
+  native_tanh_tester<sycl::half3>(q, {tv[0], tv[1], tv[2]},
+                                  {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::marray<sycl::half, 3>>(
+      q, {tv[0], tv[1], tv[2]}, {tl[0], tl[1], tl[2]}, {tu[0], tu[1], tu[2]});
+  native_tanh_tester<sycl::half4>(q, {tv[0], tv[1], tv[2], tv[3]},
+                                  {tl[0], tl[1], tl[2], tl[3]},
+                                  {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::marray<sycl::half, 4>>(
+      q, {tv[0], tv[1], tv[2], tv[3]}, {tl[0], tl[1], tl[2], tl[3]},
+      {tu[0], tu[1], tu[2], tu[3]});
+  native_tanh_tester<sycl::half8>(
+      q, {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7]});
+  native_tanh_tester<sycl::half16>(
+      q,
+      {tv[0], tv[1], tv[2], tv[3], tv[4], tv[5], tv[6], tv[7], tv[8], tv[9],
+       tv[10], tv[11], tv[12], tv[13], tv[14], tv[15]},
+      {tl[0], tl[1], tl[2], tl[3], tl[4], tl[5], tl[6], tl[7], tl[8], tl[9],
+       tl[10], tl[11], tl[12], tl[13], tl[14], tl[15]},
+      {tu[0], tu[1], tu[2], tu[3], tu[4], tu[5], tu[6], tu[7], tu[8], tu[9],
+       tu[10], tu[11], tu[12], tu[13], tu[14], tu[15]});
+
+  const sycl::half ev[16] = {-2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0,
+                             -2.0, -1.5, -1.0, 0.0, 2.0, 1.5, 1.0, 0.0};
+  const sycl::half el[16] = {0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9,
+                             0.1, 0.34, 0.4, -0.9, 3.9, 2.7, 1.9, -0.9};
+  const sycl::half eu[16] = {0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1,
+                             0.3, 0.36, 0.6, 1.1, 4.1, 2.9, 2.1, 1.1};
+
+  native_exp2_tester<sycl::half>(q, ev[0], el[0], eu[0]);
+  native_exp2_tester<sycl::half2>(q, {ev[0], ev[1]}, {el[0], el[1]},
+                                  {eu[0], eu[1]});
+  native_exp2_tester<sycl::half3>(q, {ev[0], ev[1], ev[2]},
+                                  {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
+  native_exp2_tester<sycl::half4>(q, {ev[0], ev[1], ev[2], ev[3]},
+                                  {el[0], el[1], el[2], el[3]},
+                                  {eu[0], eu[1], eu[2], eu[3]});
+  native_exp2_tester<sycl::marray<sycl::half, 3>>(
+      q, {ev[0], ev[1], ev[2]}, {el[0], el[1], el[2]}, {eu[0], eu[1], eu[2]});
+  native_exp2_tester<sycl::marray<sycl::half, 4>>(
+      q, {ev[0], ev[1], ev[2], ev[3]}, {el[0], el[1], el[2], el[3]},
+      {eu[0], eu[1], eu[2], eu[3]});
+  native_exp2_tester<sycl::half8>(
+      q, {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7]},
+      {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7]},
+      {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7]});
+  native_exp2_tester<sycl::half16>(
+      q,
+      {ev[0], ev[1], ev[2], ev[3], ev[4], ev[5], ev[6], ev[7], ev[8], ev[9],
+       ev[10], ev[11], ev[12], ev[13], ev[14], ev[15]},
+      {el[0], el[1], el[2], el[3], el[4], el[5], el[6], el[7], el[8], el[9],
+       el[10], el[11], el[12], el[13], el[14], el[15]},
+      {eu[0], eu[1], eu[2], eu[3], eu[4], eu[5], eu[6], eu[7], eu[8], eu[9],
+       eu[10], eu[11], eu[12], eu[13], eu[14], eu[15]});
+
+  return 0;
+}
diff --git a/SYCL/DeviceLib/built-ins/fast-math-flag.cpp b/SYCL/DeviceLib/built-ins/fast-math-flag.cpp
index e6b7d08238..d90845e58d 100644
--- a/SYCL/DeviceLib/built-ins/fast-math-flag.cpp
+++ b/SYCL/DeviceLib/built-ins/fast-math-flag.cpp
@@ -6,47 +6,64 @@
 #include <cassert>
 #include <sycl/sycl.hpp>
 
-#define __TEST_FFMATH_BINARY(func)                                             \
-  int test_ffmath_##func() {                                                   \
-    sycl::float4 r[2];                                                         \
-    sycl::float4 val[2] = {{1.0004f, 1e-4f, 1.4f, 14.0f},                      \
-                           {1.0004f, 1e-4f, 1.4f, 14.0f}};                     \
+using namespace sycl;
+
+template <typename T> bool checkEqual(vec<T, 4> A, vec<T, 4> B) {
+
+  return sycl::all(A == B);
+}
+
+template <typename T, size_t N>
+bool checkEqual(marray<T, N> A, marray<T, N> B) {
+  for (int i = 0; i < N; i++) {
+    if (A[i] != B[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#define __TEST_FFMATH_UNARY(func)                                              \
+  template <typename T> void test_ffmath_##func(queue &deviceQueue) {          \
+    T input{1.0004f, 1e-4f, 1.4f, 14.0f};                                      \
+    T res[2] = {{-1, -1, -1, -1}, {-2, -2, -2, -2}};                           \
     {                                                                          \
-      sycl::buffer<sycl::float4, 1> output(&r[0], sycl::range<1>(2));          \
-      sycl::buffer<sycl::float4, 1> input(&val[0], sycl::range<1>(2));         \
-      sycl::queue q;                                                           \
-      q.submit([&](sycl::handler &cgh) {                                       \
-        auto AccO =                                                            \
-            output.template get_access<sycl::access::mode::write>(cgh);        \
-        auto AccI = input.template get_access<sycl::access::mode::read>(cgh);  \
+      buffer<T, 1> input_buff(&input, 1);                                      \
+      buffer<T, 1> res_buff(&res[0], sycl::range<1>(2));                       \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_acc(res_buff,  \
+                                                                    cgh);      \
+        accessor<T, 1, access::mode::read, target::device> input_acc(          \
+            input_buff, cgh);                                                  \
         cgh.single_task([=]() {                                                \
-          AccO[0] = sycl::func(AccI[0], AccI[1]);                              \
-          AccO[1] = sycl::native::func(AccI[0], AccI[1]);                      \
+          res_acc[0] = sycl::native::func(input_acc[0]);                       \
+          res_acc[1] = sycl::func(input_acc[0]);                               \
         });                                                                    \
       });                                                                      \
     }                                                                          \
-    return sycl::all(r[0] == r[1]);                                            \
+    assert(checkEqual(res[0], res[1]));                                        \
   }
 
-#define __TEST_FFMATH_UNARY(func)                                              \
-  int test_ffmath_##func() {                                                   \
-    sycl::float4 val = {1.0004f, 1e-4f, 1.4f, 14.0f};                          \
-    sycl::float4 r[2];                                                         \
+#define __TEST_FFMATH_BINARY(func)                                             \
+  template <typename T> void test_ffmath_##func(queue &deviceQueue) {          \
+    T input[2] = {{1.0004f, 1e-4f, 1.4f, 14.0f},                               \
+                  {1.0004f, 1e-4f, 1.4f, 14.0f}};                              \
+    T res[2] = {{-1, -1, -1, -1}, {-2, -2, -2, -2}};                           \
     {                                                                          \
-      sycl::buffer<sycl::float4, 1> output(&r[0], sycl::range<1>(2));          \
-      sycl::buffer<sycl::float4, 1> input(&val, sycl::range<1>(1));            \
-      sycl::queue q;                                                           \
-      q.submit([&](sycl::handler &cgh) {                                       \
-        auto AccO =                                                            \
-            output.template get_access<sycl::access::mode::write>(cgh);        \
-        auto AccI = input.template get_access<sycl::access::mode::read>(cgh);  \
+      buffer<T, 1> input_buff(&input[0], range<1>(2));                         \
+      buffer<T, 1> res_buff(&res[0], range<1>(2));                             \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_acc(res_buff,  \
+                                                                    cgh);      \
+        accessor<T, 1, access::mode::read, target::device> input_acc(          \
+            input_buff, cgh);                                                  \
         cgh.single_task([=]() {                                                \
-          AccO[0] = sycl::func(AccI[0]);                                       \
-          AccO[1] = sycl::native::func(AccI[0]);                               \
+          res_acc[0] = sycl::native::func(input_acc[0], input_acc[1]);         \
+          res_acc[1] = sycl::func(input_acc[0], input_acc[1]);                 \
         });                                                                    \
       });                                                                      \
     }                                                                          \
-    return sycl::all(r[0] == r[1]);                                            \
+    assert(checkEqual(res[0], res[1]));                                        \
   }
 
 __TEST_FFMATH_UNARY(cos)
@@ -56,26 +73,43 @@ __TEST_FFMATH_UNARY(exp10)
 __TEST_FFMATH_UNARY(log)
 __TEST_FFMATH_UNARY(log2)
 __TEST_FFMATH_UNARY(log10)
-__TEST_FFMATH_BINARY(powr)
 __TEST_FFMATH_UNARY(rsqrt)
 __TEST_FFMATH_UNARY(sin)
 __TEST_FFMATH_UNARY(sqrt)
 __TEST_FFMATH_UNARY(tan)
 
+__TEST_FFMATH_BINARY(powr)
+
 int main() {
 
-  assert(test_ffmath_cos());
-  assert(test_ffmath_exp());
-  assert(test_ffmath_exp2());
-  assert(test_ffmath_exp10());
-  assert(test_ffmath_log());
-  assert(test_ffmath_log2());
-  assert(test_ffmath_log10());
-  assert(test_ffmath_powr());
-  assert(test_ffmath_rsqrt());
-  assert(test_ffmath_sin());
-  assert(test_ffmath_sqrt());
-  assert(test_ffmath_tan());
+  queue q;
+  test_ffmath_cos<marray<float, 4>>(q);
+  test_ffmath_exp<marray<float, 4>>(q);
+  test_ffmath_exp2<marray<float, 4>>(q);
+  test_ffmath_exp10<marray<float, 4>>(q);
+  test_ffmath_log<marray<float, 4>>(q);
+  test_ffmath_log2<marray<float, 4>>(q);
+  test_ffmath_log10<marray<float, 4>>(q);
+  test_ffmath_powr<marray<float, 4>>(q);
+  test_ffmath_rsqrt<marray<float, 4>>(q);
+  test_ffmath_sin<marray<float, 4>>(q);
+  test_ffmath_sqrt<marray<float, 4>>(q);
+  test_ffmath_tan<marray<float, 4>>(q);
+  test_ffmath_powr<marray<float, 4>>(q);
+
+  test_ffmath_cos<float4>(q);
+  test_ffmath_exp<float4>(q);
+  test_ffmath_exp2<float4>(q);
+  test_ffmath_exp10<float4>(q);
+  test_ffmath_log<float4>(q);
+  test_ffmath_log2<float4>(q);
+  test_ffmath_log10<float4>(q);
+  test_ffmath_powr<float4>(q);
+  test_ffmath_rsqrt<float4>(q);
+  test_ffmath_sin<float4>(q);
+  test_ffmath_sqrt<float4>(q);
+  test_ffmath_tan<float4>(q);
+  test_ffmath_powr<float4>(q);
 
   return 0;
 }
diff --git a/SYCL/DeviceLib/math_test_marray_vec.cpp b/SYCL/DeviceLib/math_test_marray_vec.cpp
new file mode 100644
index 0000000000..18dfc1ae00
--- /dev/null
+++ b/SYCL/DeviceLib/math_test_marray_vec.cpp
@@ -0,0 +1,31 @@
+// TODO fix windows failures
+// UNSUPPORTED: windows && (level_zero || opencl)
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// tests sycl floating point math functions for sycl::vec and sycl::marray float
+// and double cases.
+
+#include "math_test_marray_vec_common.hpp"
+
+int main() {
+  queue deviceQueue;
+  math_tests_4<float4>(deviceQueue);
+  math_tests_4<marray<float, 4>>(deviceQueue);
+
+  math_tests_3<float3>(deviceQueue);
+  math_tests_3<marray<float, 3>>(deviceQueue);
+
+  if (deviceQueue.get_device().has(sycl::aspect::fp64)) {
+    math_tests_4<double4>(deviceQueue);
+    math_tests_4<marray<double, 4>>(deviceQueue);
+
+    math_tests_3<double3>(deviceQueue);
+    math_tests_3<marray<double, 3>>(deviceQueue);
+  }
+
+  std::cout << "Pass" << std::endl;
+  return 0;
+}
diff --git a/SYCL/DeviceLib/math_test_marray_vec_common.hpp b/SYCL/DeviceLib/math_test_marray_vec_common.hpp
new file mode 100644
index 0000000000..1c10543f68
--- /dev/null
+++ b/SYCL/DeviceLib/math_test_marray_vec_common.hpp
@@ -0,0 +1,250 @@
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T1, typename T2> class TypeHelper;
+
+template <typename T> bool checkEqual(vec<T, 3> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB;
+}
+
+template <typename T> bool checkEqual(vec<T, 4> A, size_t B) {
+  T TB = B;
+  return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB;
+}
+
+template <typename T, size_t N> bool checkEqual(marray<T, N> A, size_t B) {
+  for (int i = 0; i < N; i++) {
+    if (A[i] != B) {
+      return false;
+    }
+  }
+  return true;
+}
+
+#define OPERATOR(NAME)                                                         \
+  template <typename T>                                                        \
+  void math_test_##NAME(queue &deviceQueue, T result, T input, size_t ref) {   \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input, 1);                                         \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::read, target::device> input_access(       \
+            buffer2, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>(                         \
+            [=]() { res_access[0] = NAME(input_access[0]); });                 \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR(cos)
+OPERATOR(cospi)
+OPERATOR(sin)
+OPERATOR(sinpi)
+OPERATOR(cosh)
+OPERATOR(sinh)
+OPERATOR(tan)
+OPERATOR(tanpi)
+OPERATOR(atan)
+OPERATOR(atanpi)
+OPERATOR(tanh)
+OPERATOR(acos)
+OPERATOR(acospi)
+OPERATOR(asin)
+OPERATOR(asinpi)
+OPERATOR(acosh)
+OPERATOR(asinh)
+OPERATOR(atanh)
+OPERATOR(cbrt)
+OPERATOR(ceil)
+OPERATOR(exp)
+OPERATOR(exp2)
+OPERATOR(exp10)
+OPERATOR(expm1)
+OPERATOR(tgamma)
+OPERATOR(lgamma)
+OPERATOR(erf)
+OPERATOR(erfc)
+OPERATOR(log)
+OPERATOR(log2)
+OPERATOR(log10)
+OPERATOR(log1p)
+OPERATOR(logb)
+OPERATOR(sqrt)
+OPERATOR(rsqrt)
+OPERATOR(rint)
+OPERATOR(round)
+OPERATOR(trunc)
+
+#undef OPERATOR
+
+#define OPERATOR_2(NAME)                                                       \
+  template <typename T>                                                        \
+  void math_test_2_##NAME(queue &deviceQueue, T result, T input1, T input2,    \
+                          size_t ref) {                                        \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::read, target::device> input1_access(      \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::read, target::device> input2_access(      \
+            buffer3, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
+          res_access[0] = NAME(input1_access[0], input2_access[0]);            \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR_2(pow)
+OPERATOR_2(powr)
+OPERATOR_2(atan2)
+OPERATOR_2(atan2pi)
+OPERATOR_2(copysign)
+OPERATOR_2(fdim)
+OPERATOR_2(fmin)
+OPERATOR_2(fmax)
+OPERATOR_2(fmod)
+OPERATOR_2(hypot)
+OPERATOR_2(maxmag)
+OPERATOR_2(minmag)
+OPERATOR_2(nextafter)
+OPERATOR_2(remainder)
+
+#undef OPERATOR_2
+
+#define OPERATOR_3(NAME)                                                       \
+  template <typename T>                                                        \
+  void math_test_3_##NAME(queue &deviceQueue, T result, T input1, T input2,    \
+                          T input3, size_t ref) {                              \
+    {                                                                          \
+      buffer<T, 1> buffer1(&result, 1);                                        \
+      buffer<T, 1> buffer2(&input1, 1);                                        \
+      buffer<T, 1> buffer3(&input2, 1);                                        \
+      buffer<T, 1> buffer4(&input3, 1);                                        \
+      deviceQueue.submit([&](handler &cgh) {                                   \
+        accessor<T, 1, access::mode::write, target::device> res_access(        \
+            buffer1, cgh);                                                     \
+        accessor<T, 1, access::mode::read, target::device> input1_access(      \
+            buffer2, cgh);                                                     \
+        accessor<T, 1, access::mode::read, target::device> input2_access(      \
+            buffer3, cgh);                                                     \
+        accessor<T, 1, access::mode::read, target::device> input3_access(      \
+            buffer4, cgh);                                                     \
+        cgh.single_task<TypeHelper<class a##NAME, T>>([=]() {                  \
+          res_access[0] =                                                      \
+              NAME(input1_access[0], input2_access[0], input3_access[0]);      \
+        });                                                                    \
+      });                                                                      \
+    }                                                                          \
+    assert(checkEqual(result, ref));                                           \
+  }
+
+OPERATOR_3(mad)
+OPERATOR_3(mix)
+OPERATOR_3(fma)
+
+#undef OPERATOR_3
+
+template <typename T> void math_tests_4(queue &deviceQueue) {
+  math_test_tanh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_cosh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_sinh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_acos(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_acospi(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_acosh(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_asin(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_asinpi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_asinh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_cbrt(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 1);
+  math_test_atan(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_atanpi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_atanh(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_exp(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_exp2(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 4);
+  math_test_exp10(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, 100);
+  math_test_expm1(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_ceil(deviceQueue, T{-1, -1, -1, -1}, T{0.6, 0.6, 0.6, 0.6}, 1);
+  math_test_tgamma(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 1);
+  math_test_lgamma(deviceQueue, T{-1, -1, -1, -1}, T{1, 1, 1, 1}, 0);
+  math_test_erf(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 0);
+  math_test_erfc(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0}, 1);
+  math_test_2_pow(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  4);
+  math_test_2_powr(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                   4);
+  math_test_2_atan2(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                    T{2, 2, 2, 2}, 0);
+  math_test_2_atan2pi(deviceQueue, T{-1, -1, -1, -1}, T{0, 0, 0, 0},
+                      T{2, 2, 2, 2}, 0);
+  math_test_2_copysign(deviceQueue, T{-1, -1, -1, -1}, T{-3, -3, -3, -3},
+                       T{2, 2, 2, 2}, 3);
+  math_test_2_fmin(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{3, 3, 3, 3},
+                   2);
+  math_test_2_fmax(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{3, 3, 3, 3},
+                   3);
+  math_test_2_hypot(deviceQueue, T{-1, -1, -1, -1}, T{4, 4, 4, 4},
+                    T{3, 3, 3, 3}, 5);
+  math_test_2_maxmag(deviceQueue, T{-1, -1, -1, -1}, T{-2, -2, -2, -2},
+                     T{3, 3, 3, 3}, 3);
+  math_test_2_minmag(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2},
+                     T{-3, -3, -3, -3}, 2);
+  math_test_2_remainder(deviceQueue, T{-1, -1, -1, -1}, T{5, 5, 5, 5},
+                        T{2, 2, 2, 2}, 1);
+  math_test_2_fdim(deviceQueue, T{-1, -1, -1, -1}, T{3, 3, 3, 3}, T{3, 3, 3, 3},
+                   0);
+  math_test_2_fmod(deviceQueue, T{-1, -1, -1, -1}, T{5, 5, 5, 5}, T{3, 3, 3, 3},
+                   2);
+  math_test_2_nextafter(deviceQueue, T{-1, -1, -1, -1}, T{-0, -0, -0, -0},
+                        T{+0, +0, +0, +0}, 0);
+  math_test_3_fma(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  T{1, 1, 1, 1}, 5);
+  math_test_3_mad(deviceQueue, T{-1, -1, -1, -1}, T{2, 2, 2, 2}, T{2, 2, 2, 2},
+                  T{1, 1, 1, 1}, 5);
+  math_test_3_mix(deviceQueue, T{-1, -1, -1, -1}, T{3, 3, 3, 3}, T{5, 5, 5, 5},
+                  T{0.5, 0.5, 0.5, 0.5}, 4);
+}
+
+template <typename T> void math_tests_3(queue &deviceQueue) {
+  math_test_tan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_tanh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cos(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_sin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cosh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_sinh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_acos(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_acosh(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_asin(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_asinh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_cbrt(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 1);
+  math_test_atan(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_atanh(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_exp(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_exp2(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 4);
+  math_test_exp10(deviceQueue, T{-1, -1, -1}, T{2, 2, 2}, 100);
+  math_test_expm1(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_ceil(deviceQueue, T{-1, -1, -1}, T{0.6, 0.6, 0.6}, 1);
+  math_test_tgamma(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 1);
+  math_test_lgamma(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_erf(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_erfc(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 1);
+  math_test_log(deviceQueue, T{-1, -1, -1}, T{1, 1, 1}, 0);
+  math_test_log2(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  math_test_log10(deviceQueue, T{-1, -1, -1}, T{100, 100, 100}, 2);
+  math_test_log1p(deviceQueue, T{-1, -1, -1}, T{0, 0, 0}, 0);
+  math_test_logb(deviceQueue, T{-1, -1, -1}, T{1.1, 1.1, 1.1}, 0);
+  math_test_sqrt(deviceQueue, T{-1, -1, -1}, T{4, 4, 4}, 2);
+  math_test_rsqrt(deviceQueue, T{-1, -1, -1}, T{0.25, 0.25, 0.25}, 2);
+  math_test_rint(deviceQueue, T{-1, -1, -1}, T{2.9, 2.9, 2.9}, 3);
+  math_test_round(deviceQueue, T{-1, -1, -1}, T{0.5, 0.5, 0.5}, 1);
+  math_test_trunc(deviceQueue, T{-1, -1, -1}, T{1.9, 1.9, 1.9}, 1);
+}
diff --git a/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp b/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
new file mode 100644
index 0000000000..7db7350ece
--- /dev/null
+++ b/SYCL/DeviceLib/math_test_marray_vec_fp16.cpp
@@ -0,0 +1,26 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// tests sycl floating point math functions for sycl::vec and sycl::marray fp16
+// cases.
+
+#include "math_test_marray_vec_common.hpp"
+
+int main() {
+  queue deviceQueue;
+
+  if (!deviceQueue.get_device().has(sycl::aspect::fp16)) {
+    std::cout << "skipping fp16 tests: requires fp16 device aspect."
+              << std::endl;
+    return 0;
+  }
+  math_tests_4<half4>(deviceQueue);
+  math_tests_4<marray<half, 4>>(deviceQueue);
+  math_tests_3<half3>(deviceQueue);
+  math_tests_3<marray<half, 3>>(deviceQueue);
+
+  std::cout << "Pass" << std::endl;
+  return 0;
+}