From 70f684253ff02bb1f53442c7bc315c2973a75084 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sat, 23 Apr 2022 22:20:54 -0500
Subject: [PATCH 01/11] onemkl_gemv example expanded: chebyshev example working

---
 examples/pybind11/onemkl_gemv/chebyshev.py    | 128 ++++++++
 .../onemkl_gemv/sycl_gemm/__init__.py         |   4 +-
 .../onemkl_gemv/sycl_gemm/_onemkl.cpp         | 305 +++++++++++++++++-
 3 files changed, 434 insertions(+), 3 deletions(-)
 create mode 100644 examples/pybind11/onemkl_gemv/chebyshev.py

diff --git a/examples/pybind11/onemkl_gemv/chebyshev.py b/examples/pybind11/onemkl_gemv/chebyshev.py
new file mode 100644
index 0000000000..36cf4fbbf0
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/chebyshev.py
@@ -0,0 +1,128 @@
+import numpy as np
+import sycl_gemm
+
+import dpctl
+import dpctl.tensor as dpt
+
+
+def empty_like(A):
+    return dpt.empty(A.shape, A.dtype, device=A.device)
+
+
+def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
+    """Chebyshev iterative solver using SYCL routines"""
+    d = (lMax + lMin) / 2
+    c = (lMax - lMin) / 2
+
+    x = dpt.copy(x0)
+    exec_queue = A.sycl_queue
+    assert exec_queue == x.sycl_queue
+    Ax = empty_like(A[:, 0])
+    r = empty_like(Ax)
+    p = empty_like(Ax)
+
+    e_x = dpctl.SyclEvent()
+    he_dot, e_dot = sycl_gemm.gemv(
+        exec_queue, A, x, Ax, depends=depends
+    )  # Ax = A @ x
+    he_sub, e_sub = sycl_gemm.sub(
+        exec_queue, b, Ax, r, depends=[e_dot]
+    )  # r = b - Ax
+    r_ev = e_sub
+    for i in range(nIters):
+        z = r
+        z_ev = r_ev
+        if i == 0:
+            p[:] = z
+            alpha = 1 / d
+            he_axbpy, e_axbpy = dpctl.SyclEvent(), dpctl.SyclEvent()
+        elif i == 1:
+            beta = 0.5 * (c * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace(
+                exec_queue, 1, z, beta, p, depends=[z_ev]
+            )  # p = z + beta * p
+        else:
+            beta = (c / 2 * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace(
+                exec_queue, 1, z, beta, p, depends=[z_ev]
+            )  # p = z + beta * p
+        h_x, e_x = sycl_gemm.axbpy_inplace(
+            exec_queue, alpha, p, 1, x, depends=[e_axbpy, e_x]
+        )  # x = x + alpha * p
+        he_dot, e_dot = sycl_gemm.gemv(
+            exec_queue, A, x, Ax, depends=[e_x]
+        )  # Ax = A @ x
+        he_sub, e_sub = sycl_gemm.sub(
+            exec_queue, b, Ax, r, depends=[e_dot]
+        )  # r = b - Ax
+        residual = sycl_gemm.norm_squared_blocking(
+            exec_queue, r, depends=[e_sub]
+        )  # residual = dot(r, r)
+        if residual <= 1e-29:
+            print(f"chebyshev: converged in {i} iters")
+            break
+    return x
+
+
+def check_with_numpy(A, b):
+    """Direct solver using numpy"""
+    import numpy as np
+
+    Anp = dpt.asnumpy(A)
+    bnp = dpt.asnumpy(b)
+    return np.linalg.solve(Anp, bnp)
+
+
+def chebyshev_numpy(A, b, x0, nIters, lMax, lMin):
+    """Chebyshev iterative solver using numpy"""
+    d = (lMax + lMin) / 2
+    c = (lMax - lMin) / 2
+
+    x = x0
+
+    Ax = A @ x
+    r = b - Ax
+    for i in range(nIters):
+        z = r
+        if i == 0:
+            p = z
+            alpha = 1 / d
+        elif i == 1:
+            beta = 0.5 * (c * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            p = z + beta * p
+        else:
+            beta = (c / 2 * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            p = z + beta * p
+        x += alpha * p
+        Ax = A @ x
+        r = b - Ax
+        residual = np.dot(r, r)
+        if residual <= 1e-29:
+            print(f"chebyshev_numpy: converged in {i} iters")
+            break
+    return x
+
+
+if __name__ == "__main__":
+    A = dpt.asarray(
+        [[2, 1, 0, 0], [1, 2, 1, 0], [0, 1, 2, 1], [0, 0, 1, 2]],
+        dtype="d",
+        usm_type="device",
+    )
+    dev = A.device
+    b = dpt.asarray(
+        [0.5, 0.7, 0.3, 0.1], dtype="d", usm_type="device", device=dev
+    )
+    x0 = b
+    x = chebyshev(A, b, x0, 100, 4.0, 0.25)
+    print(dpt.asnumpy(x))
+    print(check_with_numpy(A, b))
+    print(
+        chebyshev_numpy(
+            dpt.asnumpy(A), dpt.asnumpy(b), dpt.asnumpy(b), 100, 4.0, 0.25
+        )
+    )
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
index 07d7ec6ced..df8b5129fa 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
@@ -1,3 +1,3 @@
-from ._onemkl import gemv
+from ._onemkl import axbpy_inplace, gemv, norm_squared_blocking, sub
 
-__all__ = ["gemv"]
+__all__ = ["gemv", "sub", "axbpy_inplace", "norm_squared_blocking"]
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 569428004f..1fb7a951d6 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -3,6 +3,7 @@
 #include <oneapi/mkl.hpp>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <pybind11/complex.h>
 #include "dpctl4pybind11.hpp"
 // clang-format on
 
@@ -110,9 +111,311 @@ gemv(sycl::queue q,
     return std::make_pair(ht_event, res_ev);
 }
 
+template <typename T> class sub_kern;
+
+template <typename T>
+sycl::event sub_impl(sycl::queue q,
+                     size_t n,
+                     const char *v1_i,
+                     const char *v2_i,
+                     char *r_i,
+                     const std::vector<sycl::event> &depends = {})
+{
+    const T *v1 = reinterpret_cast<const T *>(v1_i);
+    const T *v2 = reinterpret_cast<const T *>(v2_i);
+    T *r = reinterpret_cast<T *>(r_i);
+
+    sycl::event r_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.parallel_for<sub_kern<T>>(sycl::range<1>{n}, [=](sycl::id<1> id) {
+            auto i = id.get(0);
+            r[i] = v1[i] - v2[i];
+        });
+    });
+
+    return r_ev;
+}
+
+// out_r = in_v1 - in_v2
+std::pair<sycl::event, sycl::event>
+sub(sycl::queue q,
+    dpctl::tensor::usm_ndarray in_v1,
+    dpctl::tensor::usm_ndarray in_v2,
+    dpctl::tensor::usm_ndarray out_r,
+    const std::vector<sycl::event> &depends = {})
+{
+    if (in_v1.get_ndim() != 1 || in_v2.get_ndim() != 1 || out_r.get_ndim() != 1)
+    {
+        throw std::runtime_error("Inconsistent dimensions, expecting vectors");
+    }
+
+    py::ssize_t n = in_v1.get_shape(0); // get length of the vector
+
+    if (n != in_v2.get_shape(0) || n != out_r.get_shape(0)) {
+        throw std::runtime_error("Vectors must have the same length");
+    }
+
+    int in_v1_flags = in_v1.get_flags();
+    int in_v2_flags = in_v2.get_flags();
+    int out_r_flags = out_r.get_flags();
+
+    if (!((in_v1_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) &&
+          (in_v2_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) &&
+          (out_r_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))))
+    {
+        throw std::runtime_error("Vectors must be contiguous.");
+    }
+
+    int in_v1_typenum = in_v1.get_typenum();
+    int in_v2_typenum = in_v2.get_typenum();
+    int out_r_typenum = out_r.get_typenum();
+
+    if ((in_v2_typenum != in_v1_typenum) || (out_r_typenum != in_v1_typenum) ||
+        !((in_v1_typenum == UAR_DOUBLE) || (in_v1_typenum == UAR_FLOAT) ||
+          (in_v1_typenum == UAR_CDOUBLE) || (in_v1_typenum == UAR_CFLOAT)))
+    {
+        throw std::runtime_error(
+            "Only real and complex floating point arrays are supported.");
+    }
+
+    const char *in_v1_typeless_ptr = in_v1.get_data();
+    const char *in_v2_typeless_ptr = in_v2.get_data();
+    char *out_r_typeless_ptr = out_r.get_data();
+
+    sycl::event res_ev;
+    if (out_r_typenum == UAR_DOUBLE) {
+        using T = double;
+        res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
+                             out_r_typeless_ptr, depends);
+    }
+    else if (out_r_typenum == UAR_FLOAT) {
+        using T = float;
+        res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
+                             out_r_typeless_ptr, depends);
+    }
+    else if (out_r_typenum == UAR_CDOUBLE) {
+        using T = std::complex<double>;
+        res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
+                             out_r_typeless_ptr, depends);
+    }
+    else if (out_r_typenum == UAR_CFLOAT) {
+        using T = std::complex<float>;
+        res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
+                             out_r_typeless_ptr, depends);
+    }
+    else {
+        throw std::runtime_error("Type dispatch ran into trouble.");
+    }
+
+    sycl::event ht_event = keep_args_alive(q, {in_v1, in_v2, out_r}, {res_ev});
+
+    return std::make_pair(ht_event, res_ev);
+}
+
+template <typename T> class axbpy_inplace_kern;
+
+template <typename T>
+sycl::event axbpy_inplace_impl(sycl::queue q,
+                               size_t nelems,
+                               py::object pyobj_a,
+                               const char *x_typeless,
+                               py::object pyobj_b,
+                               char *y_typeless,
+                               const std::vector<sycl::event> depends = {})
+{
+    T a = py::cast<T>(pyobj_a);
+    T b = py::cast<T>(pyobj_b);
+
+    const T *x = reinterpret_cast<const T *>(x_typeless);
+    T *y = reinterpret_cast<T *>(y_typeless);
+
+    sycl::event res_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.parallel_for<axbpy_inplace_kern<T>>(sycl::range<1>{nelems},
+                                                [=](sycl::id<1> id) {
+                                                    auto i = id.get(0);
+                                                    y[i] = a * x[i] + b * y[i];
+                                                });
+    });
+
+    return res_ev;
+}
+
+// y = a * x + b * y
+std::pair<sycl::event, sycl::event>
+axbpy_inplace(sycl::queue q,
+              py::object a,
+              dpctl::tensor::usm_ndarray x,
+              py::object b,
+              dpctl::tensor::usm_ndarray y,
+              const std::vector<sycl::event> &depends = {})
+{
+
+    if (x.get_ndim() != 1 || y.get_ndim() != 1) {
+        throw std::runtime_error("Inconsistent dimensions, expecting vectors");
+    }
+
+    py::ssize_t n = x.get_shape(0); // get length of the vector
+
+    if (n != y.get_shape(0)) {
+        throw std::runtime_error("Vectors must have the same length");
+    }
+
+    int x_flags = x.get_flags();
+    int y_flags = y.get_flags();
+
+    if (!((x_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) &&
+          (y_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))))
+    {
+        throw std::runtime_error("Vectors must be contiguous.");
+    }
+
+    int x_typenum = x.get_typenum();
+    int y_typenum = y.get_typenum();
+
+    if ((x_typenum != y_typenum) ||
+        !((x_typenum == UAR_DOUBLE) || (x_typenum == UAR_FLOAT) ||
+          (x_typenum == UAR_CDOUBLE) || (x_typenum == UAR_CFLOAT)))
+    {
+        throw std::runtime_error(
+            "Only real and complex floating point arrays are supported.");
+    }
+
+    const char *x_typeless_ptr = x.get_data();
+    char *y_typeless_ptr = y.get_data();
+
+    sycl::event res_ev;
+    if (x_typenum == UAR_DOUBLE) {
+        using T = double;
+        res_ev = axbpy_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+                                       y_typeless_ptr, depends);
+    }
+    else if (x_typenum == UAR_FLOAT) {
+        using T = float;
+        res_ev = axbpy_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+                                       y_typeless_ptr, depends);
+    }
+    else if (x_typenum == UAR_CDOUBLE) {
+        using T = std::complex<double>;
+        res_ev = axbpy_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+                                       y_typeless_ptr, depends);
+    }
+    else if (x_typenum == UAR_CFLOAT) {
+        using T = std::complex<float>;
+        res_ev = axbpy_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+                                       y_typeless_ptr, depends);
+    }
+    else {
+        throw std::runtime_error("Type dispatch ran into trouble.");
+    }
+
+    sycl::event ht_event = keep_args_alive(q, {x, y}, {res_ev});
+
+    return std::make_pair(ht_event, res_ev);
+}
+
+template <typename T> class norm_squared_blocking_kern;
+
+template <typename T>
+T norm_squared_blocking_impl(sycl::queue q,
+                             size_t nelems,
+                             const char *r_typeless,
+                             const std::vector<sycl::event> &depends = {})
+{
+    const T *r = reinterpret_cast<const T *>(r_typeless);
+
+    sycl::buffer<T, 1> sum_sq_buf(sycl::range<1>{1});
+
+    q.submit([&](sycl::handler &cgh) {
+         cgh.depends_on(depends);
+         auto sum_sq_reduction = sycl::reduction(
+             sum_sq_buf, cgh, sycl::plus<T>(),
+             {sycl::property::reduction::initialize_to_identity{}});
+         cgh.parallel_for<norm_squared_blocking_kern<T>>(
+             sycl::range<1>{nelems}, sum_sq_reduction,
+             [=](sycl::id<1> id, auto &sum_sq) {
+                 auto i = id.get(0);
+                 sum_sq += r[i] * r[i];
+             });
+     }).wait_and_throw();
+
+    sycl::host_accessor ha(sum_sq_buf);
+    return ha[0];
+}
+
+py::object norm_squared_blocking(sycl::queue q,
+                                 dpctl::tensor::usm_ndarray r,
+                                 const std::vector<sycl::event> depends = {})
+{
+    if (r.get_ndim() != 1) {
+        throw std::runtime_error("Expecting a vector");
+    }
+
+    py::ssize_t n = r.get_shape(0); // get length of the vector
+
+    int r_flags = r.get_flags();
+
+    if (!(r_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))) {
+        throw std::runtime_error("Vector must be contiguous.");
+    }
+
+    int r_typenum = r.get_typenum();
+    if ((r_typenum != UAR_DOUBLE) && (r_typenum != UAR_FLOAT) &&
+        (r_typenum != UAR_CDOUBLE) && (r_typenum != UAR_CFLOAT))
+    {
+        throw std::runtime_error(
+            "Only real and complex floating point arrays are supported.");
+    }
+
+    const char *r_typeless_ptr = r.get_data();
+    py::object res;
+
+    if (r_typenum == UAR_DOUBLE) {
+        using T = double;
+        T n_sq = norm_squared_blocking_impl<T>(q, n, r_typeless_ptr, depends);
+        res = py::float_(n_sq);
+    }
+    else if (r_typenum == UAR_FLOAT) {
+        using T = float;
+        T n_sq = norm_squared_blocking_impl<T>(q, n, r_typeless_ptr, depends);
+        res = py::float_(n_sq);
+    }
+#if 0
+    else if (r_typenum == UAR_CDOUBLE) {
+        using T = std::complex<double>;
+        double n_sq = norm_squared_blocking_impl<T>(
+	    q, n, r_typeless_ptr,
+	    depends);
+	res = py::float_(n_sq);
+    }
+    else if (r_typenum == UAR_CFLOAT) {
+        using T = std::complex<float>;
+        float n_sq = norm_squared_blocking_impl<T>(
+	    q, n, r_typeless_ptr,
+	    depends);
+	res = py::float_(n_sq);
+    }
+#endif
+    else {
+        throw std::runtime_error("Type dispatch ran into trouble.");
+    }
+
+    return res;
+}
+
 PYBIND11_MODULE(_onemkl, m)
 {
     // Import the dpctl extensions
     import_dpctl();
-    m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)");
+    m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)",
+          py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"),
+          py::arg("resvec"), py::arg("depends") = py::list());
+    m.def("sub", &sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"),
+          py::arg("in1"), py::arg("in2"), py::arg("out"),
+          py::arg("depends") = py::list());
+    m.def("axbpy_inplace", &axbpy_inplace, "y = a * x + b * y",
+          py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"),
+          py::arg("y"), py::arg("depends") = py::list());
+    m.def("norm_squared_blocking", &norm_squared_blocking, "norm(r)**2",
+          py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list());
 }

From c4ad438086eb553265581e522c04fb57171cbd01 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 25 Apr 2022 10:25:50 -0500
Subject: [PATCH 02/11] Implemented conjugate gradient linear solver

---
 examples/pybind11/onemkl_gemv/chebyshev.py    | 128 ----------
 examples/pybind11/onemkl_gemv/solve.py        | 229 ++++++++++++++++++
 .../onemkl_gemv/sycl_gemm/__init__.py         |  16 +-
 .../onemkl_gemv/sycl_gemm/_onemkl.cpp         | 137 ++++++++++-
 4 files changed, 370 insertions(+), 140 deletions(-)
 delete mode 100644 examples/pybind11/onemkl_gemv/chebyshev.py
 create mode 100644 examples/pybind11/onemkl_gemv/solve.py

diff --git a/examples/pybind11/onemkl_gemv/chebyshev.py b/examples/pybind11/onemkl_gemv/chebyshev.py
deleted file mode 100644
index 36cf4fbbf0..0000000000
--- a/examples/pybind11/onemkl_gemv/chebyshev.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import numpy as np
-import sycl_gemm
-
-import dpctl
-import dpctl.tensor as dpt
-
-
-def empty_like(A):
-    return dpt.empty(A.shape, A.dtype, device=A.device)
-
-
-def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
-    """Chebyshev iterative solver using SYCL routines"""
-    d = (lMax + lMin) / 2
-    c = (lMax - lMin) / 2
-
-    x = dpt.copy(x0)
-    exec_queue = A.sycl_queue
-    assert exec_queue == x.sycl_queue
-    Ax = empty_like(A[:, 0])
-    r = empty_like(Ax)
-    p = empty_like(Ax)
-
-    e_x = dpctl.SyclEvent()
-    he_dot, e_dot = sycl_gemm.gemv(
-        exec_queue, A, x, Ax, depends=depends
-    )  # Ax = A @ x
-    he_sub, e_sub = sycl_gemm.sub(
-        exec_queue, b, Ax, r, depends=[e_dot]
-    )  # r = b - Ax
-    r_ev = e_sub
-    for i in range(nIters):
-        z = r
-        z_ev = r_ev
-        if i == 0:
-            p[:] = z
-            alpha = 1 / d
-            he_axbpy, e_axbpy = dpctl.SyclEvent(), dpctl.SyclEvent()
-        elif i == 1:
-            beta = 0.5 * (c * alpha) ** 2
-            alpha = 1 / (d - beta / alpha)
-            he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace(
-                exec_queue, 1, z, beta, p, depends=[z_ev]
-            )  # p = z + beta * p
-        else:
-            beta = (c / 2 * alpha) ** 2
-            alpha = 1 / (d - beta / alpha)
-            he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace(
-                exec_queue, 1, z, beta, p, depends=[z_ev]
-            )  # p = z + beta * p
-        h_x, e_x = sycl_gemm.axbpy_inplace(
-            exec_queue, alpha, p, 1, x, depends=[e_axbpy, e_x]
-        )  # x = x + alpha * p
-        he_dot, e_dot = sycl_gemm.gemv(
-            exec_queue, A, x, Ax, depends=[e_x]
-        )  # Ax = A @ x
-        he_sub, e_sub = sycl_gemm.sub(
-            exec_queue, b, Ax, r, depends=[e_dot]
-        )  # r = b - Ax
-        residual = sycl_gemm.norm_squared_blocking(
-            exec_queue, r, depends=[e_sub]
-        )  # residual = dot(r, r)
-        if residual <= 1e-29:
-            print(f"chebyshev: converged in {i} iters")
-            break
-    return x
-
-
-def check_with_numpy(A, b):
-    """Direct solver using numpy"""
-    import numpy as np
-
-    Anp = dpt.asnumpy(A)
-    bnp = dpt.asnumpy(b)
-    return np.linalg.solve(Anp, bnp)
-
-
-def chebyshev_numpy(A, b, x0, nIters, lMax, lMin):
-    """Chebyshev iterative solver using numpy"""
-    d = (lMax + lMin) / 2
-    c = (lMax - lMin) / 2
-
-    x = x0
-
-    Ax = A @ x
-    r = b - Ax
-    for i in range(nIters):
-        z = r
-        if i == 0:
-            p = z
-            alpha = 1 / d
-        elif i == 1:
-            beta = 0.5 * (c * alpha) ** 2
-            alpha = 1 / (d - beta / alpha)
-            p = z + beta * p
-        else:
-            beta = (c / 2 * alpha) ** 2
-            alpha = 1 / (d - beta / alpha)
-            p = z + beta * p
-        x += alpha * p
-        Ax = A @ x
-        r = b - Ax
-        residual = np.dot(r, r)
-        if residual <= 1e-29:
-            print(f"chebyshev_numpy: converged in {i} iters")
-            break
-    return x
-
-
-if __name__ == "__main__":
-    A = dpt.asarray(
-        [[2, 1, 0, 0], [1, 2, 1, 0], [0, 1, 2, 1], [0, 0, 1, 2]],
-        dtype="d",
-        usm_type="device",
-    )
-    dev = A.device
-    b = dpt.asarray(
-        [0.5, 0.7, 0.3, 0.1], dtype="d", usm_type="device", device=dev
-    )
-    x0 = b
-    x = chebyshev(A, b, x0, 100, 4.0, 0.25)
-    print(dpt.asnumpy(x))
-    print(check_with_numpy(A, b))
-    print(
-        chebyshev_numpy(
-            dpt.asnumpy(A), dpt.asnumpy(b), dpt.asnumpy(b), 100, 4.0, 0.25
-        )
-    )
diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
new file mode 100644
index 0000000000..1340556770
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/solve.py
@@ -0,0 +1,229 @@
+import numpy as np
+import sycl_gemm
+
+import dpctl
+import dpctl.tensor as dpt
+
+
+def empty_like(A):
+    return dpt.empty(A.shape, A.dtype, device=A.device)
+
+
+def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
+    """Chebyshev iterative solver using SYCL routines"""
+    d = (lMax + lMin) / 2
+    c = (lMax - lMin) / 2
+
+    x = dpt.copy(x0)
+    exec_queue = A.sycl_queue
+    assert exec_queue == x.sycl_queue
+    Ax = empty_like(A[:, 0])
+    r = empty_like(Ax)
+    p = empty_like(Ax)
+
+    e_x = dpctl.SyclEvent()
+    he_dot, e_dot = sycl_gemm.gemv(
+        exec_queue, A, x, Ax, depends=depends
+    )  # Ax = A @ x
+    he_sub, e_sub = sycl_gemm.sub(
+        exec_queue, b, Ax, r, depends=[e_dot]
+    )  # r = b - Ax
+    r_ev = e_sub
+    for i in range(nIters):
+        z = r
+        z_ev = r_ev
+        if i == 0:
+            p[:] = z
+            alpha = 1 / d
+            he_axbpy, e_axbpy = dpctl.SyclEvent(), dpctl.SyclEvent()
+        elif i == 1:
+            beta = 0.5 * (c * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace(
+                exec_queue, 1, z, beta, p, depends=[z_ev]
+            )  # p = z + beta * p
+        else:
+            beta = (c / 2 * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace(
+                exec_queue, 1, z, beta, p, depends=[z_ev]
+            )  # p = z + beta * p
+        h_x, e_x = sycl_gemm.axbpy_inplace(
+            exec_queue, alpha, p, 1, x, depends=[e_axbpy, e_x]
+        )  # x = x + alpha * p
+        he_dot, e_dot = sycl_gemm.gemv(
+            exec_queue, A, x, Ax, depends=[e_x]
+        )  # Ax = A @ x
+        he_sub, e_sub = sycl_gemm.sub(
+            exec_queue, b, Ax, r, depends=[e_dot]
+        )  # r = b - Ax
+        residual = sycl_gemm.norm_squared_blocking(
+            exec_queue, r, depends=[e_sub]
+        )  # residual = dot(r, r)
+        if residual <= 1e-29:
+            print(f"chebyshev: converged in {i} iters")
+            break
+    exec_queue.wait()  # wait for all host tasks to complete
+    return x
+
+
+def check_with_numpy(A, b):
+    """Direct solver using numpy"""
+    import numpy as np
+
+    return np.linalg.solve(Anp, bnp)
+
+
+def chebyshev_numpy(A, b, x0, nIters, lMax, lMin):
+    """Chebyshev iterative solver using numpy"""
+    d = (lMax + lMin) / 2
+    c = (lMax - lMin) / 2
+
+    x = x0
+
+    Ax = np.dot(A, x)
+    r = b - Ax
+    for i in range(nIters):
+        z = r
+        if i == 0:
+            p = z
+            alpha = 1 / d
+        elif i == 1:
+            beta = 0.5 * (c * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            p = z + beta * p
+        else:
+            beta = (c / 2 * alpha) ** 2
+            alpha = 1 / (d - beta / alpha)
+            p = z + beta * p
+        x = x + alpha * p
+        Ax = np.dot(A, x)
+        r = b - Ax
+        residual = np.dot(r, r)
+        if residual <= 1e-29:
+            print(f"chebyshev_numpy: converged in {i} iters")
+            break
+    return x
+
+
+def cg_solve(A, b):
+    """
+    Conjugate gradient solver for A @ x == b.
+
+    Returns tuple: (x, converged)
+
+    converged is False if solver has not converged, or the iteration number
+    """
+    exec_queue = A.sycl_queue
+    x = dpt.zeros(b.shape, dtype=b.dtype)
+    Ap = empty_like(x)
+
+    all_host_tasks = []
+    r = dpt.copy(b)
+    p = dpt.copy(b)
+    rsold = sycl_gemm.norm_squared_blocking(exec_queue, r)
+    if rsold < 1e-20:
+        return (b, 0)
+    converged = False
+    max_iters = b.shape[0]
+
+    e_p = dpctl.SyclEvent()
+    e_x = dpctl.SyclEvent()
+    for i in range(max_iters):
+        he_dot, e_dot = sycl_gemm.gemv(
+            exec_queue, A, p, Ap, depends=[e_p]
+        )  # Ap = A @ p
+        all_host_tasks.append(he_dot)
+        alpha = rsold / sycl_gemm.dot_blocking(  # alpha = rsold / dot(p, Ap)
+            exec_queue, p, Ap, depends=[e_dot]
+        )
+        he1_axbpy, e1_axbpy = sycl_gemm.axbpy_inplace(
+            exec_queue, alpha, p, 1, x, depends=[e_p, e_x]
+        )  # x = x + alpha * p
+        all_host_tasks.append(he1_axbpy)
+        e_x = e1_axbpy
+
+        he2_axbpy, e2_axbpy = sycl_gemm.axbpy_inplace(
+            exec_queue, -alpha, Ap, 1, r, depends=[e_p]
+        )  # r = r - alpha * Ap
+        all_host_tasks.append(he2_axbpy)
+
+        rsnew = sycl_gemm.norm_squared_blocking(
+            exec_queue, r, depends=[e2_axbpy]
+        )
+        if rsnew < 1e-20:
+            e1_axbpy.wait()
+            converged = i
+            break
+        beta = rsnew / rsold
+
+        he3_axbpy, e3_axbpy = sycl_gemm.axbpy_inplace(
+            exec_queue, 1, r, beta, p, depends=[e1_axbpy, e2_axbpy]
+        )  # p = r + beta * p
+
+        rsold = rsnew
+        all_host_tasks.append(he3_axbpy)
+        e_p = e3_axbpy
+
+    dpctl.SyclEvent.wait_for(all_host_tasks)
+    return x, converged
+
+
+def cg_solve_numpy(A, b):
+    x = np.zeros_like(b)
+    r = b - np.dot(A, x)
+    p = r
+    rsold = np.dot(r, r)
+    converged = False
+    max_iters = b.shape[0]
+
+    for i in range(max_iters):
+        Ap = np.dot(A, p)
+        alpha = rsold / np.dot(p, Ap)
+        x = x + alpha * p
+        r = r - alpha * Ap
+        rsnew = np.dot(r, r)
+
+        if rsnew < 1e-20:
+            converged = i
+            break
+
+        beta = rsnew / rsold
+        p = r + beta * p
+        rsold = rsnew
+
+    return (x, converged)
+
+
+if __name__ == "__main__":
+    n = 32
+    lambda_max = 4
+    lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2))))
+    # eigenvalues of cartan matrix are
+    Anp = (
+        2 * np.eye(n, n, k=0, dtype="d")
+        + np.eye(n, n, k=1, dtype="d")
+        + np.eye(n, n, k=-1, dtype="d")
+    )
+    bnp = np.geomspace(0.5, 2, n, dtype="d")
+
+    q = dpctl.SyclQueue(property="enable_profiling")
+    A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q)
+    dev = A.device
+    b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev)
+    x0 = b
+    t = dpctl.SyclTimer()
+    with t(dev.sycl_queue):
+        x, conv = cg_solve(A, b)
+    print((conv, t.dt))
+    with t(dev.sycl_queue):
+        x, conv = cg_solve(A, b)
+    print(dpt.asnumpy(x))
+    print((conv, t.dt))
+    with t(dev.sycl_queue):
+        print(check_with_numpy(Anp, bnp))
+    print(t.dt)
+    with t(dev.sycl_queue):
+        x_np, conv = cg_solve_numpy(dpt.asnumpy(A), dpt.asnumpy(b))
+    print(x_np, conv)
+    print(t.dt)
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
index df8b5129fa..14a8c0dec8 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
@@ -1,3 +1,15 @@
-from ._onemkl import axbpy_inplace, gemv, norm_squared_blocking, sub
+from ._onemkl import (
+    axbpy_inplace,
+    dot_blocking,
+    gemv,
+    norm_squared_blocking,
+    sub,
+)
 
-__all__ = ["gemv", "sub", "axbpy_inplace", "norm_squared_blocking"]
+__all__ = [
+    "gemv",
+    "sub",
+    "axbpy_inplace",
+    "norm_squared_blocking",
+    "dot_blocking",
+]
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 1fb7a951d6..05590565e2 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -343,6 +343,38 @@ T norm_squared_blocking_impl(sycl::queue q,
     return ha[0];
 }
 
+template <typename T> class complex_norm_squared_blocking_kern;
+
+template <typename T>
+T complex_norm_squared_blocking_impl(
+    sycl::queue q,
+    size_t nelems,
+    const char *r_typeless,
+    const std::vector<sycl::event> &depends = {})
+{
+    const std::complex<T> *r =
+        reinterpret_cast<const std::complex<T> *>(r_typeless);
+
+    sycl::buffer<T, 1> sum_sq_buf(sycl::range<1>{1});
+
+    q.submit([&](sycl::handler &cgh) {
+         cgh.depends_on(depends);
+         auto sum_sq_reduction = sycl::reduction(
+             sum_sq_buf, cgh, sycl::plus<T>(),
+             {sycl::property::reduction::initialize_to_identity{}});
+         cgh.parallel_for<complex_norm_squared_blocking_kern<T>>(
+             sycl::range<1>{nelems}, sum_sq_reduction,
+             [=](sycl::id<1> id, auto &sum_sq) {
+                 auto i = id.get(0);
+                 sum_sq +=
+                     r[i].real() * r[i].real() + r[i].imag() * r[i].imag();
+             });
+     }).wait_and_throw();
+
+    sycl::host_accessor ha(sum_sq_buf);
+    return ha[0];
+}
+
 py::object norm_squared_blocking(sycl::queue q,
                                  dpctl::tensor::usm_ndarray r,
                                  const std::vector<sycl::event> depends = {})
@@ -380,22 +412,105 @@ py::object norm_squared_blocking(sycl::queue q,
         T n_sq = norm_squared_blocking_impl<T>(q, n, r_typeless_ptr, depends);
         res = py::float_(n_sq);
     }
-#if 0
     else if (r_typenum == UAR_CDOUBLE) {
         using T = std::complex<double>;
-        double n_sq = norm_squared_blocking_impl<T>(
-	    q, n, r_typeless_ptr,
-	    depends);
-	res = py::float_(n_sq);
+        double n_sq = complex_norm_squared_blocking_impl<double>(
+            q, n, r_typeless_ptr, depends);
+        res = py::float_(n_sq);
     }
     else if (r_typenum == UAR_CFLOAT) {
         using T = std::complex<float>;
-        float n_sq = norm_squared_blocking_impl<T>(
-	    q, n, r_typeless_ptr,
-	    depends);
-	res = py::float_(n_sq);
+        float n_sq = complex_norm_squared_blocking_impl<float>(
+            q, n, r_typeless_ptr, depends);
+        res = py::float_(n_sq);
+    }
+    else {
+        throw std::runtime_error("Type dispatch ran into trouble.");
+    }
+
+    return res;
+}
+
+py::object dot_blocking(sycl::queue q,
+                        dpctl::tensor::usm_ndarray v1,
+                        dpctl::tensor::usm_ndarray v2,
+                        const std::vector<sycl::event> &depends = {})
+{
+    if (v1.get_ndim() != 1 || v2.get_ndim() != 1) {
+        throw std::runtime_error("Expecting two vectors");
+    }
+
+    py::ssize_t n = v1.get_shape(0); // get length of the vector
+
+    if (n != v2.get_shape(0)) {
+        throw std::runtime_error("Length of vectors are not the same");
+    }
+
+    int v1_flags = v1.get_flags();
+    int v2_flags = v2.get_flags();
+
+    if (!(v1_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) ||
+        !(v2_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)))
+    {
+        throw std::runtime_error("Vectors must be contiguous.");
+    }
+
+    int v1_typenum = v1.get_typenum();
+    int v2_typenum = v2.get_typenum();
+
+    if ((v1_typenum != v2_typenum) ||
+        ((v1_typenum != UAR_DOUBLE) && (v1_typenum != UAR_FLOAT) &&
+         (v1_typenum != UAR_CDOUBLE) && (v1_typenum != UAR_CFLOAT)))
+    {
+        throw py::value_error(
+            "Data types of vectors must be the same. "
+            "Only real and complex floating types are supported.");
+    }
+
+    const char *v1_typeless_ptr = v1.get_data();
+    const char *v2_typeless_ptr = v2.get_data();
+    py::object res;
+
+    if (v1_typenum == UAR_DOUBLE) {
+        using T = double;
+        T *res_usm = sycl::malloc_device<T>(1, q);
+        sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
+            q, n, reinterpret_cast<const T *>(v1_typeless_ptr), 1,
+            reinterpret_cast<const T *>(v2_typeless_ptr), 1, res_usm, depends);
+        T res_v{};
+        q.copy<T>(res_usm, &res_v, 1, {dot_ev}).wait_and_throw();
+        res = py::float_(res_v);
+    }
+    else if (v1_typenum == UAR_FLOAT) {
+        using T = float;
+        T *res_usm = sycl::malloc_device<T>(1, q);
+        sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
+            q, n, reinterpret_cast<const T *>(v1_typeless_ptr), 1,
+            reinterpret_cast<const T *>(v2_typeless_ptr), 1, res_usm, depends);
+        T res_v(0);
+        q.copy<T>(res_usm, &res_v, 1, {dot_ev}).wait_and_throw();
+        res = py::float_(res_v);
+    }
+    else if (v1_typenum == UAR_CDOUBLE) {
+        using T = std::complex<double>;
+        T *res_usm = sycl::malloc_device<T>(1, q);
+        sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc(
+            q, n, reinterpret_cast<const T *>(v1_typeless_ptr), 1,
+            reinterpret_cast<const T *>(v2_typeless_ptr), 1, res_usm, depends);
+        T res_v{};
+        q.copy<T>(res_usm, &res_v, 1, {dotc_ev}).wait_and_throw();
+        res = py::cast(res_v);
+    }
+    else if (v1_typenum == UAR_CFLOAT) {
+        using T = std::complex<float>;
+        T *res_usm = sycl::malloc_device<T>(1, q);
+        sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc(
+            q, n, reinterpret_cast<const T *>(v1_typeless_ptr), 1,
+            reinterpret_cast<const T *>(v2_typeless_ptr), 1, res_usm, depends);
+        T res_v{};
+        q.copy<T>(res_usm, &res_v, 1, {dotc_ev}).wait_and_throw();
+        res = py::cast(res_v);
     }
-#endif
     else {
         throw std::runtime_error("Type dispatch ran into trouble.");
     }
@@ -418,4 +533,6 @@ PYBIND11_MODULE(_onemkl, m)
           py::arg("y"), py::arg("depends") = py::list());
     m.def("norm_squared_blocking", &norm_squared_blocking, "norm(r)**2",
           py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list());
+    m.def("dot_blocking", &dot_blocking, "<v1, v2>", py::arg("exec_queue"),
+          py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list());
 }

From 49cee5cccb0d2f1d2050edaea260d60ceddd90e2 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 25 Apr 2022 10:51:20 -0500
Subject: [PATCH 03/11] Renamed function: axpby for a*x plus b*y. Added tests
 for each function.

---
 examples/pybind11/onemkl_gemv/solve.py        | 32 +++++-----
 .../onemkl_gemv/sycl_gemm/__init__.py         |  4 +-
 .../onemkl_gemv/sycl_gemm/_onemkl.cpp         | 18 +++---
 .../pybind11/onemkl_gemv/tests/test_gemm.py   | 62 ++++++++++++++++++-
 4 files changed, 87 insertions(+), 29 deletions(-)

diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
index 1340556770..b12681ab35 100644
--- a/examples/pybind11/onemkl_gemv/solve.py
+++ b/examples/pybind11/onemkl_gemv/solve.py
@@ -35,21 +35,21 @@ def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
         if i == 0:
             p[:] = z
             alpha = 1 / d
-            he_axbpy, e_axbpy = dpctl.SyclEvent(), dpctl.SyclEvent()
+            he_axpby, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent()
         elif i == 1:
             beta = 0.5 * (c * alpha) ** 2
             alpha = 1 / (d - beta / alpha)
-            he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace(
+            he_axpby, e_axpby = sycl_gemm.axpby_inplace(
                 exec_queue, 1, z, beta, p, depends=[z_ev]
             )  # p = z + beta * p
         else:
             beta = (c / 2 * alpha) ** 2
             alpha = 1 / (d - beta / alpha)
-            he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace(
+            he_axpby, e_axpby = sycl_gemm.axpby_inplace(
                 exec_queue, 1, z, beta, p, depends=[z_ev]
             )  # p = z + beta * p
-        h_x, e_x = sycl_gemm.axbpy_inplace(
-            exec_queue, alpha, p, 1, x, depends=[e_axbpy, e_x]
+        h_x, e_x = sycl_gemm.axpby_inplace(
+            exec_queue, alpha, p, 1, x, depends=[e_axpby, e_x]
         )  # x = x + alpha * p
         he_dot, e_dot = sycl_gemm.gemv(
             exec_queue, A, x, Ax, depends=[e_x]
@@ -137,33 +137,33 @@ def cg_solve(A, b):
         alpha = rsold / sycl_gemm.dot_blocking(  # alpha = rsold / dot(p, Ap)
             exec_queue, p, Ap, depends=[e_dot]
         )
-        he1_axbpy, e1_axbpy = sycl_gemm.axbpy_inplace(
+        he1_axpby, e1_axpby = sycl_gemm.axpby_inplace(
             exec_queue, alpha, p, 1, x, depends=[e_p, e_x]
         )  # x = x + alpha * p
-        all_host_tasks.append(he1_axbpy)
-        e_x = e1_axbpy
+        all_host_tasks.append(he1_axpby)
+        e_x = e1_axpby
 
-        he2_axbpy, e2_axbpy = sycl_gemm.axbpy_inplace(
+        he2_axpby, e2_axpby = sycl_gemm.axpby_inplace(
             exec_queue, -alpha, Ap, 1, r, depends=[e_p]
         )  # r = r - alpha * Ap
-        all_host_tasks.append(he2_axbpy)
+        all_host_tasks.append(he2_axpby)
 
         rsnew = sycl_gemm.norm_squared_blocking(
-            exec_queue, r, depends=[e2_axbpy]
+            exec_queue, r, depends=[e2_axpby]
         )
         if rsnew < 1e-20:
-            e1_axbpy.wait()
+            e1_axpby.wait()
             converged = i
             break
         beta = rsnew / rsold
 
-        he3_axbpy, e3_axbpy = sycl_gemm.axbpy_inplace(
-            exec_queue, 1, r, beta, p, depends=[e1_axbpy, e2_axbpy]
+        he3_axpby, e3_axpby = sycl_gemm.axpby_inplace(
+            exec_queue, 1, r, beta, p, depends=[e1_axpby, e2_axpby]
         )  # p = r + beta * p
 
         rsold = rsnew
-        all_host_tasks.append(he3_axbpy)
-        e_p = e3_axbpy
+        all_host_tasks.append(he3_axpby)
+        e_p = e3_axpby
 
     dpctl.SyclEvent.wait_for(all_host_tasks)
     return x, converged
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
index 14a8c0dec8..45af05e21b 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
@@ -1,5 +1,5 @@
 from ._onemkl import (
-    axbpy_inplace,
+    axpby_inplace,
     dot_blocking,
     gemv,
     norm_squared_blocking,
@@ -9,7 +9,7 @@
 __all__ = [
     "gemv",
     "sub",
-    "axbpy_inplace",
+    "axpby_inplace",
     "norm_squared_blocking",
     "dot_blocking",
 ]
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 05590565e2..84d33bc91e 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -212,10 +212,10 @@ sub(sycl::queue q,
     return std::make_pair(ht_event, res_ev);
 }
 
-template <typename T> class axbpy_inplace_kern;
+template <typename T> class axpby_inplace_kern;
 
 template <typename T>
-sycl::event axbpy_inplace_impl(sycl::queue q,
+sycl::event axpby_inplace_impl(sycl::queue q,
                                size_t nelems,
                                py::object pyobj_a,
                                const char *x_typeless,
@@ -231,7 +231,7 @@ sycl::event axbpy_inplace_impl(sycl::queue q,
 
     sycl::event res_ev = q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
-        cgh.parallel_for<axbpy_inplace_kern<T>>(sycl::range<1>{nelems},
+        cgh.parallel_for<axpby_inplace_kern<T>>(sycl::range<1>{nelems},
                                                 [=](sycl::id<1> id) {
                                                     auto i = id.get(0);
                                                     y[i] = a * x[i] + b * y[i];
@@ -243,7 +243,7 @@ sycl::event axbpy_inplace_impl(sycl::queue q,
 
 // y = a * x + b * y
 std::pair<sycl::event, sycl::event>
-axbpy_inplace(sycl::queue q,
+axpby_inplace(sycl::queue q,
               py::object a,
               dpctl::tensor::usm_ndarray x,
               py::object b,
@@ -287,22 +287,22 @@ axbpy_inplace(sycl::queue q,
     sycl::event res_ev;
     if (x_typenum == UAR_DOUBLE) {
         using T = double;
-        res_ev = axbpy_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+        res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
                                        y_typeless_ptr, depends);
     }
     else if (x_typenum == UAR_FLOAT) {
         using T = float;
-        res_ev = axbpy_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+        res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
                                        y_typeless_ptr, depends);
     }
     else if (x_typenum == UAR_CDOUBLE) {
         using T = std::complex<double>;
-        res_ev = axbpy_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+        res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
                                        y_typeless_ptr, depends);
     }
     else if (x_typenum == UAR_CFLOAT) {
         using T = std::complex<float>;
-        res_ev = axbpy_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
+        res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
                                        y_typeless_ptr, depends);
     }
     else {
@@ -528,7 +528,7 @@ PYBIND11_MODULE(_onemkl, m)
     m.def("sub", &sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"),
           py::arg("in1"), py::arg("in2"), py::arg("out"),
           py::arg("depends") = py::list());
-    m.def("axbpy_inplace", &axbpy_inplace, "y = a * x + b * y",
+    m.def("axpby_inplace", &axpby_inplace, "y = a * x + b * y",
           py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"),
           py::arg("y"), py::arg("depends") = py::list());
     m.def("norm_squared_blocking", &norm_squared_blocking, "norm(r)**2",
diff --git a/examples/pybind11/onemkl_gemv/tests/test_gemm.py b/examples/pybind11/onemkl_gemv/tests/test_gemm.py
index 7a78fba346..6090322c75 100644
--- a/examples/pybind11/onemkl_gemv/tests/test_gemm.py
+++ b/examples/pybind11/onemkl_gemv/tests/test_gemm.py
@@ -1,6 +1,12 @@
 import numpy as np
 import pytest
-from sycl_gemm import gemv
+from sycl_gemm import (
+    axpby_inplace,
+    dot_blocking,
+    gemv,
+    norm_squared_blocking,
+    sub,
+)
 
 import dpctl
 import dpctl.tensor as dpt
@@ -15,7 +21,59 @@ def test_gemv():
     r = dpt.empty((5,), dtype="d", sycl_queue=q)
     M = dpt.asarray(Mnp, sycl_queue=q)
     v = dpt.asarray(vnp, sycl_queue=q)
-    hev, ev = gemv(M.sycl_queue, M, v, r, [])
+    hev, ev = gemv(q, M, v, r, [])
     hev.wait()
     rnp = dpt.asnumpy(r)
     assert np.allclose(rnp, Mnp @ vnp)
+
+
+def test_sub():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    anp, bnp = np.random.randn(5), np.random.randn(5)
+    r = dpt.empty((5,), dtype="d", sycl_queue=q)
+    a = dpt.asarray(anp, sycl_queue=q)
+    b = dpt.asarray(bnp, sycl_queue=q)
+    hev, ev = sub(q, a, b, r, [])
+    hev.wait()
+    rnp = dpt.asnumpy(r)
+    assert np.allclose(rnp + bnp, anp)
+
+
+def test_axpby():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    xnp, pnp = np.random.randn(5), np.random.randn(5)
+    x = dpt.asarray(xnp, sycl_queue=q)
+    p = dpt.asarray(pnp, sycl_queue=q)
+    hev, ev = axpby_inplace(q, 0.5, x, -0.7, p, [])
+    hev.wait()
+    rnp = dpt.asnumpy(p)
+    assert np.allclose(rnp, 0.5 * xnp - 0.7 * pnp)
+
+
+def test_dot():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    anp, bnp = np.random.randn(5), np.random.randn(5)
+    a = dpt.asarray(anp, sycl_queue=q)
+    b = dpt.asarray(bnp, sycl_queue=q)
+    dot_res = dot_blocking(q, a, b)
+    assert np.allclose(dot_res, np.dot(anp, bnp))
+
+
+def test_norm_squared():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    anp = np.random.randn(5)
+    a = dpt.asarray(anp, sycl_queue=q)
+    dot_res = norm_squared_blocking(q, a)
+    assert np.allclose(dot_res, np.dot(anp, anp))

From 06db2583c8640fbf4d0ea0ea1e1845932029d526 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 25 Apr 2022 13:01:18 -0500
Subject: [PATCH 04/11] Used scikit-build for all pybind11 examples, added
 tests to all.

---
 .../external_usm_allocation/CMakeLists.txt    | 36 +++++++++++++++++
 .../external_usm_allocation/README.md         |  3 +-
 .../external_usm_allocation/example.py        |  2 +-
 .../external_usm_allocation/__init__.py       |  3 ++
 .../_usm_alloc_example.cpp                    |  2 +-
 .../pybind11/external_usm_allocation/setup.py | 26 +++++--------
 .../tests/test_dmatrix.py                     | 30 ++++++++++++++
 .../use_dpctl_syclqueue/CMakeLists.txt        | 36 +++++++++++++++++
 .../pybind11/use_dpctl_syclqueue/README.md    |  3 +-
 .../pybind11/use_dpctl_syclqueue/example.py   |  2 +-
 .../pybind11/use_dpctl_syclqueue/setup.py     | 25 +++++-------
 .../tests/test_queue_device.py                | 39 +++++++++++++++++++
 .../use_queue_device/__init__.py              | 13 +++++++
 .../{ => use_queue_device}/_example.cpp       |  2 +-
 14 files changed, 183 insertions(+), 39 deletions(-)
 create mode 100644 examples/pybind11/external_usm_allocation/CMakeLists.txt
 create mode 100644 examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py
 rename examples/pybind11/external_usm_allocation/{ => external_usm_allocation}/_usm_alloc_example.cpp (99%)
 create mode 100644 examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
 create mode 100644 examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt
 create mode 100644 examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py
 create mode 100644 examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py
 rename examples/pybind11/use_dpctl_syclqueue/{ => use_queue_device}/_example.cpp (98%)

diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt
new file mode 100644
index 0000000000..6f42a3e6d3
--- /dev/null
+++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.21)
+
+project(external_usm_allocation LANGUAGES CXX)
+
+set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
+find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Fetch pybind11
+include(FetchContent)
+FetchContent_Declare(
+  pybind11
+  URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz
+  URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1
+)
+FetchContent_MakeAvailable(pybind11)
+
+find_package(PythonExtensions REQUIRED)
+find_package(Dpctl REQUIRED)
+find_package(NumPy REQUIRED)
+
+set(py_module_name _external_usm_alloc)
+pybind11_add_module(${py_module_name}
+    MODULE
+    external_usm_allocation/_usm_alloc_example.cpp
+)
+target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
+target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations)
+install(TARGETS ${py_module_name}
+  DESTINATION external_usm_allocation
+)
+
+set(ignoreMe "${SKBUILD}")
diff --git a/examples/pybind11/external_usm_allocation/README.md b/examples/pybind11/external_usm_allocation/README.md
index 7431ba4ab6..0b8dd62dc2 100644
--- a/examples/pybind11/external_usm_allocation/README.md
+++ b/examples/pybind11/external_usm_allocation/README.md
@@ -10,6 +10,7 @@ to dpctl.memory entities using `__sycl_usm_array_interface__`.
 ```
 source /opt/intel/oneapi/compiler/latest/env/vars.sh
 CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace
+python -m pytest tests
 python example.py
 ```
 
@@ -17,7 +18,7 @@ python example.py
 
 ```
 (idp) [12:43:20 ansatnuc04 external_usm_allocation]$ python example.py
-<external_usm_alloc.DMatrix object at 0x7f2b98b4cef0>
+<external_usm_allocation._external_usm_alloc.DMatrix object at 0x7f2b98b4cef0>
 {'data': [94846745444352, True], 'shape': (5, 5), 'strides': None, 'version': 1, 'typestr': '|f8', 'syclobj': <capsule object "SyclQueueRef" at 0x7f2b9b941d80>}
 shared
 
diff --git a/examples/pybind11/external_usm_allocation/example.py b/examples/pybind11/external_usm_allocation/example.py
index 4455616146..6001d87eca 100644
--- a/examples/pybind11/external_usm_allocation/example.py
+++ b/examples/pybind11/external_usm_allocation/example.py
@@ -16,7 +16,7 @@
 
 # coding: utf-8
 
-import external_usm_alloc as eua
+import external_usm_allocation as eua
 import numpy as np
 
 import dpctl
diff --git a/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py
new file mode 100644
index 0000000000..c4a46c18e6
--- /dev/null
+++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py
@@ -0,0 +1,3 @@
+from ._external_usm_alloc import DMatrix
+
+__all__ = ["DMatrix"]
diff --git a/examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp
similarity index 99%
rename from examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp
rename to examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp
index 30216dac55..2b8212076b 100644
--- a/examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp
+++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp
@@ -132,7 +132,7 @@ py::list tolist(DMatrix &m)
     return rows;
 }
 
-PYBIND11_MODULE(external_usm_alloc, m)
+PYBIND11_MODULE(_external_usm_alloc, m)
 {
     // Import the dpctl extensions
     import_dpctl();
diff --git a/examples/pybind11/external_usm_allocation/setup.py b/examples/pybind11/external_usm_allocation/setup.py
index 2802075d41..86abd2854a 100644
--- a/examples/pybind11/external_usm_allocation/setup.py
+++ b/examples/pybind11/external_usm_allocation/setup.py
@@ -14,21 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pybind11.setup_helpers import Pybind11Extension
-from setuptools import setup
+from skbuild import setup
 
-import dpctl
-
-ext_modules = [
-    Pybind11Extension(
-        "external_usm_alloc",
-        ["./_usm_alloc_example.cpp"],
-        include_dirs=[dpctl.get_include()],
-        extra_compile_args=["-fPIC"],
-        extra_link_args=["-fPIC"],
-        libraries=["sycl"],
-        language="c++",
-    )
-]
-
-setup(name="external_usm_alloc", ext_modules=ext_modules)
+setup(
+    name="external_usm_allocation",
+    version="0.0.1",
+    description="an example of SYCL-powered Python package (with pybind11)",
+    author="Intel Scripting",
+    license="Apache 2.0",
+    packages=["external_usm_allocation"],
+)
diff --git a/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
new file mode 100644
index 0000000000..0afe80eb12
--- /dev/null
+++ b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
@@ -0,0 +1,30 @@
+import external_usm_allocation as eua
+import numpy as np
+
+import dpctl
+import dpctl.memory as dpm
+
+
+def test_dmatrix():
+    q = dpctl.SyclQueue()
+    matr = eua.DMatrix(q, 5, 5)
+    assert hasattr(matr, "__sycl_usm_array_interface__")
+
+    blob = dpm.as_usm_memory(matr)
+    assert blob.get_usm_type() == "shared"
+
+    Xh = np.array(
+        [
+            [1, 1, 1, 2, 2],
+            [1, 0, 1, 2, 2],
+            [1, 1, 0, 2, 2],
+            [0, 0, 0, 3, -1],
+            [0, 0, 0, -1, 5],
+        ],
+        dtype="d",
+    )
+    host_bytes_view = Xh.reshape((-1)).view(np.ubyte)
+    blob.copy_from_host(host_bytes_view)
+
+    list_of_lists = matr.tolist()
+    assert list_of_lists == Xh.tolist()
diff --git a/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt b/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt
new file mode 100644
index 0000000000..a788c56bae
--- /dev/null
+++ b/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt
@@ -0,0 +1,36 @@
+cmake_minimum_required(VERSION 3.21)
+
+project(use_queue_device LANGUAGES CXX)
+
+set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
+find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Fetch pybind11
+include(FetchContent)
+FetchContent_Declare(
+  pybind11
+  URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz
+  URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1
+)
+FetchContent_MakeAvailable(pybind11)
+
+find_package(PythonExtensions REQUIRED)
+find_package(Dpctl REQUIRED)
+find_package(NumPy REQUIRED)
+
+set(py_module_name _use_queue_device)
+pybind11_add_module(${py_module_name}
+    MODULE
+    use_queue_device/_example.cpp
+)
+target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
+target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations)
+install(TARGETS ${py_module_name}
+  DESTINATION use_queue_device
+)
+
+set(ignoreMe "${SKBUILD}")
diff --git a/examples/pybind11/use_dpctl_syclqueue/README.md b/examples/pybind11/use_dpctl_syclqueue/README.md
index 3e7b6804a9..f3ef6e92e5 100644
--- a/examples/pybind11/use_dpctl_syclqueue/README.md
+++ b/examples/pybind11/use_dpctl_syclqueue/README.md
@@ -9,7 +9,8 @@ extensions.
 
 ```
 source /opt/intel/oneapi/compiler/latest/env/vars.sh
-CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace
+CXX=icpx python setup.py build_ext --inplace
+python -m pytest tests
 python example.py
 ```
 
diff --git a/examples/pybind11/use_dpctl_syclqueue/example.py b/examples/pybind11/use_dpctl_syclqueue/example.py
index 7189adecb8..698e43baf9 100644
--- a/examples/pybind11/use_dpctl_syclqueue/example.py
+++ b/examples/pybind11/use_dpctl_syclqueue/example.py
@@ -17,7 +17,7 @@
 # coding: utf-8
 
 import numpy as np
-import use_queue_device_ext as eg
+import use_queue_device as eg
 
 import dpctl
 
diff --git a/examples/pybind11/use_dpctl_syclqueue/setup.py b/examples/pybind11/use_dpctl_syclqueue/setup.py
index 34eeebe38c..4d09be820e 100644
--- a/examples/pybind11/use_dpctl_syclqueue/setup.py
+++ b/examples/pybind11/use_dpctl_syclqueue/setup.py
@@ -14,20 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pybind11.setup_helpers import Pybind11Extension
-from setuptools import setup
+from skbuild import setup
 
-import dpctl
-
-exts = [
-    Pybind11Extension(
-        "use_queue_device_ext",
-        ["./_example.cpp"],
-        include_dirs=[dpctl.get_include()],
-        extra_compile_args=["-fPIC"],
-        extra_link_args=["-fPIC"],
-        language="c++",
-    ),
-]
-
-setup(name="pybind11_example", ext_modules=exts)
+setup(
+    name="use_queue_device",
+    version="0.0.1",
+    description="an example of SYCL-powered Python package (with pybind11)",
+    author="Intel Scripting",
+    license="Apache 2.0",
+    packages=["use_queue_device"],
+)
diff --git a/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py
new file mode 100644
index 0000000000..899da500d5
--- /dev/null
+++ b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py
@@ -0,0 +1,39 @@
+import numpy as np
+import use_queue_device as uqd
+
+import dpctl
+
+
+def test_compute_units():
+    q = dpctl.SyclQueue()
+    mcu = uqd.get_max_compute_units(q)
+
+    assert type(mcu) is int
+    assert mcu == q.sycl_device.max_compute_units
+
+
+def test_global_memory():
+    d = dpctl.SyclDevice()
+    gm = uqd.get_device_global_mem_size(d)
+    assert type(gm) is int
+    assert gm == d.global_mem_size
+
+
+def test_local_memory():
+    d = dpctl.SyclDevice()
+    lm = uqd.get_device_local_mem_size(d)
+    assert type(lm) is int
+    assert lm == d.local_mem_size
+
+
+def test_offload_array_mod():
+    execution_queue = dpctl.SyclQueue()
+    X = np.random.randint(low=1, high=2**16 - 1, size=10**6, dtype=np.int64)
+    modulus_p = 347
+
+    # Y is a regular NumPy array with NumPy allocated host memory
+    Y = uqd.offloaded_array_mod(execution_queue, X, modulus_p)
+
+    Ynp = X % modulus_p
+
+    assert np.array_equal(Y, Ynp)
diff --git a/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py
new file mode 100644
index 0000000000..83b48b9cd7
--- /dev/null
+++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py
@@ -0,0 +1,13 @@
+from ._use_queue_device import (
+    get_device_global_mem_size,
+    get_device_local_mem_size,
+    get_max_compute_units,
+    offloaded_array_mod,
+)
+
+__all__ = [
+    "get_max_compute_units",
+    "get_device_global_mem_size",
+    "get_device_local_mem_size",
+    "offloaded_array_mod",
+]
diff --git a/examples/pybind11/use_dpctl_syclqueue/_example.cpp b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp
similarity index 98%
rename from examples/pybind11/use_dpctl_syclqueue/_example.cpp
rename to examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp
index 7b26a35858..54474a0ca1 100644
--- a/examples/pybind11/use_dpctl_syclqueue/_example.cpp
+++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp
@@ -84,7 +84,7 @@ offloaded_array_mod(sycl::queue &q,
     return res;
 }
 
-PYBIND11_MODULE(use_queue_device_ext, m)
+PYBIND11_MODULE(_use_queue_device, m)
 {
     // Import the dpctl extensions
     import_dpctl();

From 5822704c0574c9dd29e10b8277997fd6723f80b4 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 25 Apr 2022 13:23:26 -0500
Subject: [PATCH 05/11] Added license headers

---
 .../external_usm_allocation/__init__.py       | 24 +++++++++++++++
 .../tests/test_dmatrix.py                     | 18 ++++++++++++
 .../tests/test_queue_device.py                | 18 ++++++++++++
 .../use_queue_device/__init__.py              | 29 +++++++++++++++++++
 4 files changed, 89 insertions(+)

diff --git a/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py
index c4a46c18e6..1af16147fa 100644
--- a/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py
+++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py
@@ -1,3 +1,27 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
 from ._external_usm_alloc import DMatrix
 
 __all__ = ["DMatrix"]
+
+__doc__ = """
+   Example of implementing C++ class with its own USM memory allocation logic
+and interfacing that allocation with `dpctl` by implementing
+`__sycl_usm_array_interface__`.
+"""
diff --git a/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
index 0afe80eb12..96bfb951e8 100644
--- a/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
+++ b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
@@ -1,3 +1,21 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
 import external_usm_allocation as eua
 import numpy as np
 
diff --git a/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py
index 899da500d5..120cc1fd9e 100644
--- a/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py
+++ b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py
@@ -1,3 +1,21 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
 import numpy as np
 import use_queue_device as uqd
 
diff --git a/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py
index 83b48b9cd7..ccbca8fd73 100644
--- a/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py
+++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py
@@ -1,3 +1,21 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
 from ._use_queue_device import (
     get_device_global_mem_size,
     get_device_local_mem_size,
@@ -11,3 +29,14 @@
     "get_device_local_mem_size",
     "offloaded_array_mod",
 ]
+
+__doc__ = """
+Example pybind11 extension demonstrating binding of dpctl entities to
+SYCL entities.
+
+dpctl provides type casters that bind ``sycl::queue`` to `dpctl.SyclQueue`,
+``sycl::device`` to `dpctl.SyclDevice`, etc.
+
+Use of these type casters simplifies writing of Python extensions and compile
+then using SYCL C++ compilers, such as Intel(R) oneAPI DPC++ compiler.
+"""

From d31977fb3dcc638dfc53fa5ede74792c0f807293 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 26 Apr 2022 11:25:20 -0500
Subject: [PATCH 06/11] Added license headers and sycl_timing_solver.py file

---
 examples/pybind11/onemkl_gemv/setup.py        | 16 ++++++
 examples/pybind11/onemkl_gemv/solve.py        | 45 ++++++++++++----
 .../onemkl_gemv/sycl_gemm/__init__.py         | 16 ++++++
 .../onemkl_gemv/sycl_gemm/_onemkl.cpp         | 28 ++++++++++
 .../onemkl_gemv/sycl_timing_solver.py         | 53 +++++++++++++++++++
 5 files changed, 147 insertions(+), 11 deletions(-)
 create mode 100644 examples/pybind11/onemkl_gemv/sycl_timing_solver.py

diff --git a/examples/pybind11/onemkl_gemv/setup.py b/examples/pybind11/onemkl_gemv/setup.py
index dd00d8e1a9..1ed65e3af8 100644
--- a/examples/pybind11/onemkl_gemv/setup.py
+++ b/examples/pybind11/onemkl_gemv/setup.py
@@ -1,3 +1,19 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from skbuild import setup
 
 setup(
diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
index b12681ab35..68bb162290 100644
--- a/examples/pybind11/onemkl_gemv/solve.py
+++ b/examples/pybind11/onemkl_gemv/solve.py
@@ -1,3 +1,19 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 import sycl_gemm
 
@@ -197,15 +213,16 @@ def cg_solve_numpy(A, b):
 
 if __name__ == "__main__":
     n = 32
-    lambda_max = 4
-    lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2))))
-    # eigenvalues of cartan matrix are
     Anp = (
         2 * np.eye(n, n, k=0, dtype="d")
         + np.eye(n, n, k=1, dtype="d")
         + np.eye(n, n, k=-1, dtype="d")
     )
     bnp = np.geomspace(0.5, 2, n, dtype="d")
+    # bounds on eigenvalues of cartan matrix are, needed only
+    # for the Chebyshev solver
+    lambda_max = 4
+    lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2))))
 
     q = dpctl.SyclQueue(property="enable_profiling")
     A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q)
@@ -215,15 +232,21 @@ def cg_solve_numpy(A, b):
     t = dpctl.SyclTimer()
     with t(dev.sycl_queue):
         x, conv = cg_solve(A, b)
-    print((conv, t.dt))
+    print("SYCL solver, 1st run: ", (conv, t.dt))
     with t(dev.sycl_queue):
         x, conv = cg_solve(A, b)
-    print(dpt.asnumpy(x))
-    print((conv, t.dt))
-    with t(dev.sycl_queue):
-        print(check_with_numpy(Anp, bnp))
-    print(t.dt)
+    print("SYCL solver, 2nd run: ", (conv, t.dt))
+
+    x_ref = check_with_numpy(Anp, bnp)  # solve usign LU solver
+
     with t(dev.sycl_queue):
         x_np, conv = cg_solve_numpy(dpt.asnumpy(A), dpt.asnumpy(b))
-    print(x_np, conv)
-    print(t.dt)
+    print("NumPy's powered CG solver: ", (conv, t.dt))
+    print(
+        "SYCL cg-solver solution close to reference: ",
+        np.allclose(dpt.asnumpy(x), x_ref),
+    )
+    print(
+        "NumPy's cg-solver solution close to reference: ",
+        np.allclose(x_np, x_ref),
+    )
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
index 45af05e21b..b2939f246d 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
@@ -1,3 +1,19 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from ._onemkl import (
     axpby_inplace,
     dot_blocking,
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 84d33bc91e..3a4cf715ab 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -1,3 +1,31 @@
+//==- _onemkl.cpp - Example of Pybind11 extension working with  -===//
+//  dpctl Python objects.
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements Pybind11-generated extension exposing functions that
+/// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and
+/// dpctl.tensor.usm_ndarray as arguments.
+///
+//===----------------------------------------------------------------------===//
+
 // clang-format off
 #include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
new file mode 100644
index 0000000000..3cd077d5a9
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
@@ -0,0 +1,53 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2020-2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+# coding: utf-8
+import solve
+
+import dpctl
+import dpctl.tensor as dpt
+
+n = 1000
+rank = 16
+
+print(
+    f"Solving {n} by {n} diagonal linear "
+    f"system with rank {rank} perturbation."
+)
+
+Anp = np.eye(n, n) + (lambda x: x.T @ x)(np.random.randn(rank, n))
+bnp = np.random.rand(n)
+
+q = dpctl.SyclQueue(property=["enable_profiling"])
+q.print_device_info()
+if q.is_in_order:
+    print("Using in-order queue")
+else:
+    print("Using not in-order queue")
+
+api_dev = dpctl.tensor.Device.create_device(q)
+A = dpt.asarray(Anp, "d", device=api_dev)
+b = dpt.asarray(bnp, "d", device=api_dev)
+
+timer = dpctl.SyclTimer(time_scale=1e3)
+
+for i in range(20):
+    with timer(api_dev.sycl_queue):
+        solve.cg_solve(A, b)
+
+    print(i, timer.dt)

From d768c9b41875f267160ec1216f276e308742076d Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 26 Apr 2022 11:48:54 -0500
Subject: [PATCH 07/11] Special-case b==1 case in y[i] = a*x[i] + b*y[i] kernel

---
 .../onemkl_gemv/sycl_gemm/_onemkl.cpp         | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 3a4cf715ab..0328679dd8 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -240,6 +240,7 @@ sub(sycl::queue q,
     return std::make_pair(ht_event, res_ev);
 }
 
+template <typename T> class axpy_inplace_kern;
 template <typename T> class axpby_inplace_kern;
 
 template <typename T>
@@ -259,11 +260,20 @@ sycl::event axpby_inplace_impl(sycl::queue q,
 
     sycl::event res_ev = q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
-        cgh.parallel_for<axpby_inplace_kern<T>>(sycl::range<1>{nelems},
-                                                [=](sycl::id<1> id) {
-                                                    auto i = id.get(0);
-                                                    y[i] = a * x[i] + b * y[i];
-                                                });
+        if (b == T(1)) {
+            cgh.parallel_for<axpy_inplace_kern<T>>(sycl::range<1>{nelems},
+                                                   [=](sycl::id<1> id) {
+                                                       auto i = id.get(0);
+                                                       y[i] += a * x[i];
+                                                   });
+        }
+        else {
+            cgh.parallel_for<axpby_inplace_kern<T>>(
+                sycl::range<1>{nelems}, [=](sycl::id<1> id) {
+                    auto i = id.get(0);
+                    y[i] = b * y[i] + a * x[i];
+                });
+        }
     });
 
     return res_ev;

From bd9bc5f3be90284e5cf5dff01f1a91aaadf89f05 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 26 Apr 2022 11:49:48 -0500
Subject: [PATCH 08/11] Make comments in solve.py more readable

---
 examples/pybind11/onemkl_gemv/solve.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
index 68bb162290..676ce9c16d 100644
--- a/examples/pybind11/onemkl_gemv/solve.py
+++ b/examples/pybind11/onemkl_gemv/solve.py
@@ -146,24 +146,27 @@ def cg_solve(A, b):
     e_p = dpctl.SyclEvent()
     e_x = dpctl.SyclEvent()
     for i in range(max_iters):
-        he_dot, e_dot = sycl_gemm.gemv(
-            exec_queue, A, p, Ap, depends=[e_p]
-        )  # Ap = A @ p
+        # Ap = A @ p
+        he_dot, e_dot = sycl_gemm.gemv(exec_queue, A, p, Ap, depends=[e_p])
         all_host_tasks.append(he_dot)
-        alpha = rsold / sycl_gemm.dot_blocking(  # alpha = rsold / dot(p, Ap)
+        # alpha = rsold / dot(p, Ap)
+        alpha = rsold / sycl_gemm.dot_blocking(
             exec_queue, p, Ap, depends=[e_dot]
         )
+        # x = x + alpha * p
         he1_axpby, e1_axpby = sycl_gemm.axpby_inplace(
             exec_queue, alpha, p, 1, x, depends=[e_p, e_x]
-        )  # x = x + alpha * p
+        )
         all_host_tasks.append(he1_axpby)
         e_x = e1_axpby
 
+        # r = r - alpha * Ap
         he2_axpby, e2_axpby = sycl_gemm.axpby_inplace(
             exec_queue, -alpha, Ap, 1, r, depends=[e_p]
-        )  # r = r - alpha * Ap
+        )
         all_host_tasks.append(he2_axpby)
 
+        # rsnew = dot(r, r)
         rsnew = sycl_gemm.norm_squared_blocking(
             exec_queue, r, depends=[e2_axpby]
         )
@@ -173,9 +176,10 @@ def cg_solve(A, b):
             break
         beta = rsnew / rsold
 
+        # p = r + beta * p
         he3_axpby, e3_axpby = sycl_gemm.axpby_inplace(
             exec_queue, 1, r, beta, p, depends=[e1_axpby, e2_axpby]
-        )  # p = r + beta * p
+        )
 
         rsold = rsnew
         all_host_tasks.append(he3_axpby)

From c11811a349975cd6f34a8698af25658f6fbd3216 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 26 Apr 2022 11:56:45 -0500
Subject: [PATCH 09/11] Added an empty line for readability

---
 examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 0328679dd8..afd030d6f7 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -560,6 +560,7 @@ PYBIND11_MODULE(_onemkl, m)
 {
     // Import the dpctl extensions
     import_dpctl();
+
     m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)",
           py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"),
           py::arg("resvec"), py::arg("depends") = py::list());

From 3467eeb3717a127858792c6b1cf0609dee71bb47 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 27 Apr 2022 07:25:48 -0500
Subject: [PATCH 10/11] Factored out common routines into cg_solver.hpp

The header and routines defined there are also used in standalone_cpp
target built from cpp/main.cpp

```
(idp_2022) [08:27:56 ansatnuc04 onemkl_gemv]$ $(find . -name cmake-build)/standalone_cpp 1000 11
Solving 1000 by 1000 diagonal system with rank-11 perturbation.
Converged in : 11 , 11 , 11 , 11 , 11 , 11 ,
Wall-clock cg_solve execution times: 683.416 , 408.391 , 411.849 , 412.661 , 412.317 , 412.658 ,
Redisual norm squared: 9.15541e-25
(idp_2022) [08:28:20 ansatnuc04 onemkl_gemv]$ python sycl_timing_solver.py 1000 11
Solving 1000 by 1000 diagonal linear system with rank 11 perturbation.
    Name            Intel(R) UHD Graphics [0x9bca]
    Driver version  1.3.22992
    Vendor          Intel(R) Corporation
    Profile         FULL_PROFILE
    Filter string   level_zero:gpu:0

Using not in-order queue
0 (host_dt, device_dt)= (1156.8981241434813, 404.2941620000001)
1 (host_dt, device_dt)= (422.38221131265163, 404.5959500000001)
2 (host_dt, device_dt)= (423.1058154255152, 404.4037220000001)
3 (host_dt, device_dt)= (422.79740050435066, 403.78802800000005)
4 (host_dt, device_dt)= (424.4473725557327, 404.0881560000001)
5 (host_dt, device_dt)= (425.1241758465767, 404.55793600000004)
Converged in:  [11, 11, 11, 11, 11, 11]
Python solution residual norm squared: 3.5289368114384933e-25
```

README is also expanded.
---
 examples/pybind11/onemkl_gemv/CMakeLists.txt  |  16 +-
 examples/pybind11/onemkl_gemv/README.md       |  23 +-
 examples/pybind11/onemkl_gemv/cpp/main.cpp    | 128 ++++++++++
 examples/pybind11/onemkl_gemv/solve.py        |  24 +-
 .../onemkl_gemv/sycl_gemm/_onemkl.cpp         | 126 +++-------
 .../onemkl_gemv/sycl_gemm/cg_solver.hpp       | 226 ++++++++++++++++++
 .../onemkl_gemv/sycl_timing_solver.py         |  31 ++-
 7 files changed, 465 insertions(+), 109 deletions(-)
 create mode 100644 examples/pybind11/onemkl_gemv/cpp/main.cpp
 create mode 100644 examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp

diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt b/examples/pybind11/onemkl_gemv/CMakeLists.txt
index 3ba71c6252..848e8c7727 100644
--- a/examples/pybind11/onemkl_gemv/CMakeLists.txt
+++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt
@@ -38,7 +38,7 @@ pybind11_add_module(${py_module_name}
     sycl_gemm/_onemkl.cpp
 )
 target_include_directories(${py_module_name}
-    PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR}
+    PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
 )
 target_link_libraries(${py_module_name}
     PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
@@ -53,4 +53,18 @@ set_source_files_properties(${_sycl_gemm_sources}
   COMPILE_OPTIONS "-O3;-Wno-deprecated-declarations"
 )
 
+add_executable(standalone_cpp
+  EXCLUDE_FROM_ALL
+  cpp/main.cpp
+)
+target_compile_options(standalone_cpp
+  PRIVATE -O3 -Wno-deprecated-declarations
+)
+target_include_directories(standalone_cpp
+  PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
+  )
+target_link_libraries(standalone_cpp
+    PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
+)
+
 set(ignoreMe "${SKBUILD}")
diff --git a/examples/pybind11/onemkl_gemv/README.md b/examples/pybind11/onemkl_gemv/README.md
index 9645a4dd92..e984a5e013 100644
--- a/examples/pybind11/onemkl_gemv/README.md
+++ b/examples/pybind11/onemkl_gemv/README.md
@@ -1,9 +1,15 @@
 Example of SYCL built pybind11 extension
 
-To build, use (assumes scikit-build and dpcpp) is installed
+To build, use (assumes scikit-build and dpcpp is installed):
 
 ```sh
-python setup.py develop -- -G "Ninja" -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
+python setup.py develop -- -G "Ninja" \
+     -DCMAKE_C_COMPILER:PATH=icx \
+     -DCMAKE_CXX_COMPILER:PATH=icpx \
+     -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib \
+     -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib \
+     -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include \
+     -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
 ```
 
 To run test suite
@@ -11,3 +17,16 @@ To run test suite
 ```sh
 python -m pytest tests
 ```
+
+To compare Python overhead,
+
+```
+# build standad-alone executable
+cmake --build $(find . -name cmake-build) --target standalone_cpp
+# execute it
+$(find . -name cmake-build)/standalone_cpp 1000 11
+# launch Python computatin
+python sycl_timing_solver.py 1000 11
+```
+
+Compare host times vs. C++ wall-clock times while making sure that the number of iterations is the same
diff --git a/examples/pybind11/onemkl_gemv/cpp/main.cpp b/examples/pybind11/onemkl_gemv/cpp/main.cpp
new file mode 100644
index 0000000000..c0f9a6115b
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/cpp/main.cpp
@@ -0,0 +1,128 @@
+#include "cg_solver.hpp"
+#include <CL/sycl.hpp>
+#include <chrono>
+#include <iostream>
+#include <oneapi/mkl.hpp>
+
+using T = double;
+
+int main(int argc, char *argv[])
+{
+    size_t n = 1000;
+    size_t rank = 16;
+
+    if (argc > 1) {
+        n = std::stoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        rank = std::stoi(argv[2]);
+    }
+
+    std::cout << "Solving " << n << " by " << n << " diagonal system with rank-"
+              << rank << " perturbation." << std::endl;
+
+    sycl::queue q;
+
+    // USM allocation for data needed by program
+    size_t buf_size = n * n + rank * n + 2 * n;
+    T *buf = sycl::malloc_device<T>(buf_size, q);
+    sycl::event memset_ev = q.fill<T>(buf, T(0), buf_size);
+
+    T *Amat = buf;
+    T *umat = buf + n * n;
+    T *bvec = umat + rank * n;
+    T *sol_vec = bvec + n;
+
+    sycl::event set_diag_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on({memset_ev});
+        cgh.parallel_for({n}, [=](sycl::id<1> id) {
+            auto i = id[0];
+            Amat[i * (n + 1)] = T(1);
+        });
+    });
+
+    oneapi::mkl::rng::philox4x32x10 engine(q, 7777);
+    oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>
+        distr(0.0, 1.0);
+
+    // populate umat and bvec in one call
+    sycl::event umat_rand_ev =
+        oneapi::mkl::rng::generate(distr, engine, n * rank + n, umat);
+
+    sycl::event syrk_ev = oneapi::mkl::blas::row_major::syrk(
+        q, oneapi::mkl::uplo::U, oneapi::mkl::transpose::N, n, rank, T(1), umat,
+        rank, T(1), Amat, n, {umat_rand_ev, set_diag_ev});
+
+    // need to transpose
+    sycl::event transpose_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(syrk_ev);
+        cgh.parallel_for({n * n}, [=](sycl::id<1> id) {
+            size_t i = id[0];
+            size_t i0 = i / n;
+            size_t i1 = i - i0 * n;
+            if (i0 > i1) {
+                Amat[i] = Amat[i1 * n + i0];
+            }
+        });
+    });
+
+    q.wait();
+
+    constexpr int reps = 6;
+
+    std::vector<double> time;
+    std::vector<int> conv_iters;
+
+    time.reserve(reps);
+    conv_iters.reserve(reps);
+    for (int i = 0; i < reps; ++i) {
+        auto start = std::chrono::high_resolution_clock::now();
+        int conv_iter_count = cg_solver::cg_solve(q, n, Amat, bvec, sol_vec);
+        auto end = std::chrono::high_resolution_clock::now();
+
+        time.push_back(
+            std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
+                .count() *
+            1e-06);
+
+        conv_iters.push_back(conv_iter_count);
+    }
+
+    std::cout << "Converged in : ";
+    for (auto &el : conv_iters) {
+        std::cout << el << " , ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "Wall-clock cg_solve execution times: ";
+    for (auto &el : time) {
+        std::cout << el << " , ";
+    }
+    std::cout << std::endl;
+
+    T *Ax = sycl::malloc_device<T>(2 * n + 1, q);
+    T *delta = Ax + n;
+
+    sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
+        q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, sol_vec, 1, T(0), Ax,
+        1);
+
+    sycl::event sub_ev = oneapi::mkl::vm::sub(q, n, Ax, bvec, delta, {gemv_ev},
+                                              oneapi::mkl::vm::mode::ha);
+
+    T *n2 = delta + n;
+    sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
+        q, n, delta, 1, delta, 1, n2, {sub_ev});
+
+    T n2_host{};
+    q.copy<T>(n2, &n2_host, 1, {dot_ev}).wait_and_throw();
+
+    std::cout << "Redisual norm squared: " << n2_host << std::endl;
+
+    q.wait_and_throw();
+    sycl::free(Ax, q);
+    sycl::free(buf, q);
+
+    return 0;
+}
diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
index 676ce9c16d..3bee649afb 100644
--- a/examples/pybind11/onemkl_gemv/solve.py
+++ b/examples/pybind11/onemkl_gemv/solve.py
@@ -154,36 +154,37 @@ def cg_solve(A, b):
             exec_queue, p, Ap, depends=[e_dot]
         )
         # x = x + alpha * p
-        he1_axpby, e1_axpby = sycl_gemm.axpby_inplace(
+        he1_x_update, e1_x_update = sycl_gemm.axpby_inplace(
             exec_queue, alpha, p, 1, x, depends=[e_p, e_x]
         )
-        all_host_tasks.append(he1_axpby)
-        e_x = e1_axpby
+        all_host_tasks.append(he1_x_update)
+        e_x = e1_x_update
 
         # r = r - alpha * Ap
-        he2_axpby, e2_axpby = sycl_gemm.axpby_inplace(
+        he2_r_update, e2_r_update = sycl_gemm.axpby_inplace(
             exec_queue, -alpha, Ap, 1, r, depends=[e_p]
         )
-        all_host_tasks.append(he2_axpby)
+        all_host_tasks.append(he2_r_update)
 
         # rsnew = dot(r, r)
         rsnew = sycl_gemm.norm_squared_blocking(
-            exec_queue, r, depends=[e2_axpby]
+            exec_queue, r, depends=[e2_r_update]
         )
         if rsnew < 1e-20:
-            e1_axpby.wait()
+            e1_x_update.wait()
             converged = i
             break
         beta = rsnew / rsold
 
         # p = r + beta * p
-        he3_axpby, e3_axpby = sycl_gemm.axpby_inplace(
-            exec_queue, 1, r, beta, p, depends=[e1_axpby, e2_axpby]
+        he3_p_update, e3_p_update = sycl_gemm.axpby_inplace(
+            exec_queue, 1, r, beta, p, depends=[e2_r_update]
         )
 
         rsold = rsnew
-        all_host_tasks.append(he3_axpby)
-        e_p = e3_axpby
+        all_host_tasks.append(he3_p_update)
+        e_p = e3_p_update
+        e_x = e1_x_update
 
     dpctl.SyclEvent.wait_for(all_host_tasks)
     return x, converged
@@ -229,6 +230,7 @@ def cg_solve_numpy(A, b):
     lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2))))
 
     q = dpctl.SyclQueue(property="enable_profiling")
+    q.print_device_info()
     A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q)
     dev = A.device
     b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev)
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index afd030d6f7..080bf3135f 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -29,6 +29,7 @@
 // clang-format off
 #include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>
+#include "cg_solver.hpp"
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/complex.h>
@@ -40,11 +41,11 @@ namespace py = pybind11;
 using dpctl::utils::keep_args_alive;
 
 std::pair<sycl::event, sycl::event>
-gemv(sycl::queue q,
-     dpctl::tensor::usm_ndarray matrix,
-     dpctl::tensor::usm_ndarray vector,
-     dpctl::tensor::usm_ndarray result,
-     const std::vector<sycl::event> &depends = {})
+py_gemv(sycl::queue q,
+        dpctl::tensor::usm_ndarray matrix,
+        dpctl::tensor::usm_ndarray vector,
+        dpctl::tensor::usm_ndarray result,
+        const std::vector<sycl::event> &depends = {})
 {
     if (matrix.get_ndim() != 2 || vector.get_ndim() != 1 ||
         result.get_ndim() != 1) {
@@ -139,8 +140,6 @@ gemv(sycl::queue q,
     return std::make_pair(ht_event, res_ev);
 }
 
-template <typename T> class sub_kern;
-
 template <typename T>
 sycl::event sub_impl(sycl::queue q,
                      size_t n,
@@ -153,24 +152,18 @@ sycl::event sub_impl(sycl::queue q,
     const T *v2 = reinterpret_cast<const T *>(v2_i);
     T *r = reinterpret_cast<T *>(r_i);
 
-    sycl::event r_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.parallel_for<sub_kern<T>>(sycl::range<1>{n}, [=](sycl::id<1> id) {
-            auto i = id.get(0);
-            r[i] = v1[i] - v2[i];
-        });
-    });
+    sycl::event r_ev = cg_solver::detail::sub(q, n, v1, v2, r, depends);
 
     return r_ev;
 }
 
 // out_r = in_v1 - in_v2
 std::pair<sycl::event, sycl::event>
-sub(sycl::queue q,
-    dpctl::tensor::usm_ndarray in_v1,
-    dpctl::tensor::usm_ndarray in_v2,
-    dpctl::tensor::usm_ndarray out_r,
-    const std::vector<sycl::event> &depends = {})
+py_sub(sycl::queue q,
+       dpctl::tensor::usm_ndarray in_v1,
+       dpctl::tensor::usm_ndarray in_v2,
+       dpctl::tensor::usm_ndarray out_r,
+       const std::vector<sycl::event> &depends = {})
 {
     if (in_v1.get_ndim() != 1 || in_v2.get_ndim() != 1 || out_r.get_ndim() != 1)
     {
@@ -258,35 +251,20 @@ sycl::event axpby_inplace_impl(sycl::queue q,
     const T *x = reinterpret_cast<const T *>(x_typeless);
     T *y = reinterpret_cast<T *>(y_typeless);
 
-    sycl::event res_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        if (b == T(1)) {
-            cgh.parallel_for<axpy_inplace_kern<T>>(sycl::range<1>{nelems},
-                                                   [=](sycl::id<1> id) {
-                                                       auto i = id.get(0);
-                                                       y[i] += a * x[i];
-                                                   });
-        }
-        else {
-            cgh.parallel_for<axpby_inplace_kern<T>>(
-                sycl::range<1>{nelems}, [=](sycl::id<1> id) {
-                    auto i = id.get(0);
-                    y[i] = b * y[i] + a * x[i];
-                });
-        }
-    });
+    sycl::event res_ev =
+        cg_solver::detail::axpby_inplace(q, nelems, a, x, b, y, depends);
 
     return res_ev;
 }
 
 // y = a * x + b * y
 std::pair<sycl::event, sycl::event>
-axpby_inplace(sycl::queue q,
-              py::object a,
-              dpctl::tensor::usm_ndarray x,
-              py::object b,
-              dpctl::tensor::usm_ndarray y,
-              const std::vector<sycl::event> &depends = {})
+py_axpby_inplace(sycl::queue q,
+                 py::object a,
+                 dpctl::tensor::usm_ndarray x,
+                 py::object b,
+                 dpctl::tensor::usm_ndarray y,
+                 const std::vector<sycl::event> &depends = {})
 {
 
     if (x.get_ndim() != 1 || y.get_ndim() != 1) {
@@ -352,8 +330,6 @@ axpby_inplace(sycl::queue q,
     return std::make_pair(ht_event, res_ev);
 }
 
-template <typename T> class norm_squared_blocking_kern;
-
 template <typename T>
 T norm_squared_blocking_impl(sycl::queue q,
                              size_t nelems,
@@ -362,23 +338,7 @@ T norm_squared_blocking_impl(sycl::queue q,
 {
     const T *r = reinterpret_cast<const T *>(r_typeless);
 
-    sycl::buffer<T, 1> sum_sq_buf(sycl::range<1>{1});
-
-    q.submit([&](sycl::handler &cgh) {
-         cgh.depends_on(depends);
-         auto sum_sq_reduction = sycl::reduction(
-             sum_sq_buf, cgh, sycl::plus<T>(),
-             {sycl::property::reduction::initialize_to_identity{}});
-         cgh.parallel_for<norm_squared_blocking_kern<T>>(
-             sycl::range<1>{nelems}, sum_sq_reduction,
-             [=](sycl::id<1> id, auto &sum_sq) {
-                 auto i = id.get(0);
-                 sum_sq += r[i] * r[i];
-             });
-     }).wait_and_throw();
-
-    sycl::host_accessor ha(sum_sq_buf);
-    return ha[0];
+    return cg_solver::detail::norm_squared_blocking(q, nelems, r, depends);
 }
 
 template <typename T> class complex_norm_squared_blocking_kern;
@@ -393,29 +353,13 @@ T complex_norm_squared_blocking_impl(
     const std::complex<T> *r =
         reinterpret_cast<const std::complex<T> *>(r_typeless);
 
-    sycl::buffer<T, 1> sum_sq_buf(sycl::range<1>{1});
-
-    q.submit([&](sycl::handler &cgh) {
-         cgh.depends_on(depends);
-         auto sum_sq_reduction = sycl::reduction(
-             sum_sq_buf, cgh, sycl::plus<T>(),
-             {sycl::property::reduction::initialize_to_identity{}});
-         cgh.parallel_for<complex_norm_squared_blocking_kern<T>>(
-             sycl::range<1>{nelems}, sum_sq_reduction,
-             [=](sycl::id<1> id, auto &sum_sq) {
-                 auto i = id.get(0);
-                 sum_sq +=
-                     r[i].real() * r[i].real() + r[i].imag() * r[i].imag();
-             });
-     }).wait_and_throw();
-
-    sycl::host_accessor ha(sum_sq_buf);
-    return ha[0];
+    return cg_solver::detail::complex_norm_squared_blocking(q, nelems, r,
+                                                            depends);
 }
 
-py::object norm_squared_blocking(sycl::queue q,
-                                 dpctl::tensor::usm_ndarray r,
-                                 const std::vector<sycl::event> depends = {})
+py::object py_norm_squared_blocking(sycl::queue q,
+                                    dpctl::tensor::usm_ndarray r,
+                                    const std::vector<sycl::event> depends = {})
 {
     if (r.get_ndim() != 1) {
         throw std::runtime_error("Expecting a vector");
@@ -469,10 +413,10 @@ py::object norm_squared_blocking(sycl::queue q,
     return res;
 }
 
-py::object dot_blocking(sycl::queue q,
-                        dpctl::tensor::usm_ndarray v1,
-                        dpctl::tensor::usm_ndarray v2,
-                        const std::vector<sycl::event> &depends = {})
+py::object py_dot_blocking(sycl::queue q,
+                           dpctl::tensor::usm_ndarray v1,
+                           dpctl::tensor::usm_ndarray v2,
+                           const std::vector<sycl::event> &depends = {})
 {
     if (v1.get_ndim() != 1 || v2.get_ndim() != 1) {
         throw std::runtime_error("Expecting two vectors");
@@ -561,17 +505,17 @@ PYBIND11_MODULE(_onemkl, m)
     // Import the dpctl extensions
     import_dpctl();
 
-    m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)",
+    m.def("gemv", &py_gemv, "Uses oneMKL to compute dot(matrix, vector)",
           py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"),
           py::arg("resvec"), py::arg("depends") = py::list());
-    m.def("sub", &sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"),
+    m.def("sub", &py_sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"),
           py::arg("in1"), py::arg("in2"), py::arg("out"),
           py::arg("depends") = py::list());
-    m.def("axpby_inplace", &axpby_inplace, "y = a * x + b * y",
+    m.def("axpby_inplace", &py_axpby_inplace, "y = a * x + b * y",
           py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"),
           py::arg("y"), py::arg("depends") = py::list());
-    m.def("norm_squared_blocking", &norm_squared_blocking, "norm(r)**2",
+    m.def("norm_squared_blocking", &py_norm_squared_blocking, "norm(r)**2",
           py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list());
-    m.def("dot_blocking", &dot_blocking, "<v1, v2>", py::arg("exec_queue"),
+    m.def("dot_blocking", &py_dot_blocking, "<v1, v2>", py::arg("exec_queue"),
           py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list());
 }
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp b/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp
new file mode 100644
index 0000000000..6e0396c7a1
--- /dev/null
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp
@@ -0,0 +1,226 @@
+//==- _cg_solver.cpp - C++ impl of conjugate gradient linear solver  -==//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2021 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements Pybind11-generated extension exposing functions that
+/// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and
+/// dpctl.tensor.usm_ndarray as arguments.
+///
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+namespace cg_solver
+{
+namespace detail
+{
+
+template <typename T> class sub_kern;
+
+template <typename T>
+sycl::event sub(sycl::queue &q,
+                size_t n,
+                const T *v1,
+                const T *v2,
+                T *r,
+                const std::vector<sycl::event> &depends = {})
+{
+    sycl::event r_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.parallel_for<sub_kern<T>>(sycl::range<1>{n}, [=](sycl::id<1> id) {
+            auto i = id.get(0);
+            r[i] = v1[i] - v2[i];
+        });
+    });
+
+    return r_ev;
+}
+
+template <typename T> class axpy_inplace_kern;
+template <typename T> class axpby_inplace_kern;
+
+template <typename T>
+sycl::event axpby_inplace(sycl::queue &q,
+                          size_t nelems,
+                          T a,
+                          const T *x,
+                          T b,
+                          T *y,
+                          const std::vector<sycl::event> depends = {})
+{
+
+    sycl::event res_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        if (b == T(1)) {
+            cgh.parallel_for<axpy_inplace_kern<T>>(sycl::range<1>{nelems},
+                                                   [=](sycl::id<1> id) {
+                                                       auto i = id.get(0);
+                                                       y[i] += a * x[i];
+                                                   });
+        }
+        else {
+            cgh.parallel_for<axpby_inplace_kern<T>>(
+                sycl::range<1>{nelems}, [=](sycl::id<1> id) {
+                    auto i = id.get(0);
+                    y[i] = b * y[i] + a * x[i];
+                });
+        }
+    });
+
+    return res_ev;
+}
+
+template <typename T> class norm_squared_blocking_kern;
+
+template <typename T>
+T norm_squared_blocking(sycl::queue &q,
+                        size_t nelems,
+                        const T *r,
+                        const std::vector<sycl::event> &depends = {})
+{
+    sycl::buffer<T, 1> sum_sq_buf(sycl::range<1>{1});
+
+    q.submit([&](sycl::handler &cgh) {
+         cgh.depends_on(depends);
+         auto sum_sq_reduction = sycl::reduction(
+             sum_sq_buf, cgh, sycl::plus<T>(),
+             {sycl::property::reduction::initialize_to_identity{}});
+         cgh.parallel_for<norm_squared_blocking_kern<T>>(
+             sycl::range<1>{nelems}, sum_sq_reduction,
+             [=](sycl::id<1> id, auto &sum_sq) {
+                 auto i = id.get(0);
+                 sum_sq += r[i] * r[i];
+             });
+     }).wait_and_throw();
+
+    sycl::host_accessor ha(sum_sq_buf);
+    return ha[0];
+}
+
+template <typename T> class complex_norm_squared_blocking_kern;
+
+template <typename T>
+T complex_norm_squared_blocking(sycl::queue &q,
+                                size_t nelems,
+                                const std::complex<T> *r,
+                                const std::vector<sycl::event> &depends = {})
+{
+    sycl::buffer<T, 1> sum_sq_buf(sycl::range<1>{1});
+
+    q.submit([&](sycl::handler &cgh) {
+         cgh.depends_on(depends);
+         auto sum_sq_reduction = sycl::reduction(
+             sum_sq_buf, cgh, sycl::plus<T>(),
+             {sycl::property::reduction::initialize_to_identity{}});
+         cgh.parallel_for<complex_norm_squared_blocking_kern<T>>(
+             sycl::range<1>{nelems}, sum_sq_reduction,
+             [=](sycl::id<1> id, auto &sum_sq) {
+                 auto i = id.get(0);
+                 sum_sq +=
+                     r[i].real() * r[i].real() + r[i].imag() * r[i].imag();
+             });
+     }).wait_and_throw();
+
+    sycl::host_accessor ha(sum_sq_buf);
+    return ha[0];
+}
+
+} // namespace detail
+
+template <typename T>
+int cg_solve(sycl::queue exec_q,
+             std::int64_t n,
+             const T *Amat,
+             const T *bvec,
+             T *sol_vec,
+             const std::vector<sycl::event> &depends = {},
+             T rs_threshold = T(1e-20))
+{
+    T *x_vec = sol_vec;
+    sycl::event fill_ev = exec_q.fill<T>(x_vec, T(0), n, depends);
+
+    // n for r, n for p, n for Ap and 1 for pAp_dot_dev
+    T *r = sycl::malloc_device<T>(3 * n + 1, exec_q);
+    T *p = r + n;
+    T *Ap = p + n;
+    T *pAp_dot_dev = Ap + n;
+
+    sycl::event copy_to_r_ev = exec_q.copy<T>(bvec, r, n, depends);
+    sycl::event copy_to_p_ev = exec_q.copy<T>(bvec, p, n, depends);
+    T rs_old = detail::norm_squared_blocking(exec_q, n, r, {copy_to_r_ev});
+
+    std::int64_t max_iters = n;
+
+    if (rs_old < rs_threshold) {
+        sycl::free(r, exec_q);
+        return 0;
+    }
+
+    int converged_at = max_iters;
+    sycl::event prev_dep = copy_to_p_ev;
+    sycl::event e_x = fill_ev;
+
+    for (std::int64_t i = 0; i < max_iters; ++i) {
+        sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
+            exec_q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, p, 1, T(0),
+            Ap, 1, {prev_dep});
+
+        sycl::event pAp_dot_ev = oneapi::mkl::blas::row_major::dot(
+            exec_q, n, p, 1, Ap, 1, pAp_dot_dev, {prev_dep, gemv_ev});
+
+        T pAp_dot_host{};
+        exec_q.copy<T>(pAp_dot_dev, &pAp_dot_host, 1, {pAp_dot_ev})
+            .wait_and_throw();
+        T alpha = rs_old / pAp_dot_host;
+
+        // x = x + alpha * p
+        sycl::event x_update_ev =
+            detail::axpby_inplace(exec_q, n, alpha, p, T(1), x_vec, {e_x});
+
+        // r = r - alpha * Ap
+        sycl::event r_update_ev =
+            detail::axpby_inplace(exec_q, n, -alpha, Ap, T(1), r);
+
+        T rs_new = detail::norm_squared_blocking(exec_q, n, r, {r_update_ev});
+
+        if (rs_new < rs_threshold) {
+            converged_at = i;
+            x_update_ev.wait();
+            break;
+        }
+
+        T beta = rs_new / rs_old;
+
+        // p = r + beta * p
+        prev_dep =
+            detail::axpby_inplace(exec_q, n, T(1), r, beta, p, {r_update_ev});
+        e_x = x_update_ev;
+
+        rs_old = rs_new;
+    }
+
+    sycl::free(r, exec_q);
+    return converged_at;
+}
+
+} // namespace cg_solver
diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
index 3cd077d5a9..048c9f00f7 100644
--- a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
+++ b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
@@ -14,16 +14,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
+
 import numpy as np
 
 # coding: utf-8
 import solve
+import sycl_gemm
 
 import dpctl
 import dpctl.tensor as dpt
 
+argv = sys.argv
+
 n = 1000
-rank = 16
+rank = 11
+
+if len(argv) > 1:
+    n = int(argv[1])
+if len(argv) > 2:
+    rank = int(argv[2])
+
 
 print(
     f"Solving {n} by {n} diagonal linear "
@@ -46,8 +57,20 @@
 
 timer = dpctl.SyclTimer(time_scale=1e3)
 
-for i in range(20):
+iters = []
+for i in range(6):
     with timer(api_dev.sycl_queue):
-        solve.cg_solve(A, b)
+        x, conv_in = solve.cg_solve(A, b)
+
+    print(i, "(host_dt, device_dt)=", timer.dt)
+    iters.append(conv_in)
+
+print("Converged in: ", iters)
 
-    print(i, timer.dt)
+r = dpt.empty_like(b)
+hev, ev = sycl_gemm.gemv(q, A, x, r)
+delta = dpt.empty_like(b)
+hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev])
+rs = sycl_gemm.norm_squared_blocking(q, delta)
+dpctl.SyclEvent.wait_for([hev, hev2])
+print(f"Python solution residual norm squared: {rs}")

From fc7041eaed2faa021f6b270cf3a285776e31883b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 27 Apr 2022 12:08:54 -0500
Subject: [PATCH 11/11] Streamlined Chebyshev solver

Expose cpp_cg_solve used in standalone_cpp executable from Python.
Invoked that from Python script sycl_timing_solver.py

```bash
$ python sycl_timing_solver.py 1000 11
Solving 1000 by 1000 diagonal linear system with rank 11 perturbation.
    Name            Intel(R) UHD Graphics [0x9bca]
    Driver version  1.3.22992
    Vendor          Intel(R) Corporation
    Profile         FULL_PROFILE
    Filter string   level_zero:gpu:0

Using not in-order queue
0 (host_dt, device_dt)= (1157.4030127376318, 403.9605020000001)
1 (host_dt, device_dt)= (421.32044583559036, 403.45619400000004)
2 (host_dt, device_dt)= (420.66121101379395, 402.57058400000005)
3 (host_dt, device_dt)= (421.5433243662119, 402.9254920000001)
4 (host_dt, device_dt)= (421.9988752156496, 402.8818340000001)
5 (host_dt, device_dt)= (422.3589450120926, 402.63814600000006)
Converged in:  [11, 11, 11, 11, 11, 11]
Python solution residual norm squared: 3.2839902926527995e-25
0 (host_dt, device_dt)= (412.9443597048521, 403.6290000000001)
1 (host_dt, device_dt)= (413.7023724615574, 403.8434720000001)
2 (host_dt, device_dt)= (413.4188834577799, 403.1985620000001)
3 (host_dt, device_dt)= (413.85203413665295, 402.70404800000006)
4 (host_dt, device_dt)= (416.2806496024132, 404.0513040000001)
5 (host_dt, device_dt)= (417.43320040404797, 404.74999800000006)
Converged in:  [11, 11, 11, 11, 11, 11]
cpp_cg_solve solution residual norm squared: 3.218393087932091e-25
```
---
 examples/pybind11/onemkl_gemv/solve.py        |  40 +++---
 .../onemkl_gemv/sycl_gemm/__init__.py         |   2 +
 .../onemkl_gemv/sycl_gemm/_onemkl.cpp         | 116 ++++++++++++++++++
 .../onemkl_gemv/sycl_timing_solver.py         |  16 +++
 4 files changed, 154 insertions(+), 20 deletions(-)

diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
index 3bee649afb..6304808078 100644
--- a/examples/pybind11/onemkl_gemv/solve.py
+++ b/examples/pybind11/onemkl_gemv/solve.py
@@ -38,12 +38,10 @@ def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
     p = empty_like(Ax)
 
     e_x = dpctl.SyclEvent()
-    he_dot, e_dot = sycl_gemm.gemv(
-        exec_queue, A, x, Ax, depends=depends
-    )  # Ax = A @ x
-    he_sub, e_sub = sycl_gemm.sub(
-        exec_queue, b, Ax, r, depends=[e_dot]
-    )  # r = b - Ax
+    # Ax = A @ x
+    _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=depends)
+    # r = b - Ax
+    _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot])
     r_ev = e_sub
     for i in range(nIters):
         z = r
@@ -51,31 +49,33 @@ def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
         if i == 0:
             p[:] = z
             alpha = 1 / d
-            he_axpby, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent()
+            _, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent()
         elif i == 1:
             beta = 0.5 * (c * alpha) ** 2
             alpha = 1 / (d - beta / alpha)
-            he_axpby, e_axpby = sycl_gemm.axpby_inplace(
+            # p = z + beta * p
+            _, e_axpby = sycl_gemm.axpby_inplace(
                 exec_queue, 1, z, beta, p, depends=[z_ev]
-            )  # p = z + beta * p
+            )
         else:
             beta = (c / 2 * alpha) ** 2
             alpha = 1 / (d - beta / alpha)
-            he_axpby, e_axpby = sycl_gemm.axpby_inplace(
+            # p = z + beta * p
+            _, e_axpby = sycl_gemm.axpby_inplace(
                 exec_queue, 1, z, beta, p, depends=[z_ev]
-            )  # p = z + beta * p
-        h_x, e_x = sycl_gemm.axpby_inplace(
+            )
+        # x = x + alpha * p
+        _, e_x = sycl_gemm.axpby_inplace(
             exec_queue, alpha, p, 1, x, depends=[e_axpby, e_x]
-        )  # x = x + alpha * p
-        he_dot, e_dot = sycl_gemm.gemv(
-            exec_queue, A, x, Ax, depends=[e_x]
-        )  # Ax = A @ x
-        he_sub, e_sub = sycl_gemm.sub(
-            exec_queue, b, Ax, r, depends=[e_dot]
-        )  # r = b - Ax
+        )
+        # Ax = A @ x
+        _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=[e_x])
+        # r = b - Ax
+        _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot])
+        # residual = dot(r, r)
         residual = sycl_gemm.norm_squared_blocking(
             exec_queue, r, depends=[e_sub]
-        )  # residual = dot(r, r)
+        )
         if residual <= 1e-29:
             print(f"chebyshev: converged in {i} iters")
             break
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
index b2939f246d..defce5b76f 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
@@ -16,6 +16,7 @@
 
 from ._onemkl import (
     axpby_inplace,
+    cpp_cg_solve,
     dot_blocking,
     gemv,
     norm_squared_blocking,
@@ -28,4 +29,5 @@
     "axpby_inplace",
     "norm_squared_blocking",
     "dot_blocking",
+    "cpp_cg_solve",
 ]
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 080bf3135f..0531b8880f 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -62,6 +62,15 @@ py_gemv(sycl::queue q,
         throw std::runtime_error("Inconsistent shapes.");
     }
 
+    auto q_ctx = q.get_context();
+    if (q_ctx != matrix.get_queue().get_context() ||
+        q_ctx != vector.get_queue().get_context() ||
+        q_ctx != result.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue.");
+    }
+
     int mat_flags = matrix.get_flags();
     int v_flags = vector.get_flags();
     int r_flags = result.get_flags();
@@ -176,6 +185,14 @@ py_sub(sycl::queue q,
         throw std::runtime_error("Vectors must have the same length");
     }
 
+    if (q.get_context() != in_v1.get_queue().get_context() ||
+        q.get_context() != in_v2.get_queue().get_context() ||
+        q.get_context() != out_r.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
     int in_v1_flags = in_v1.get_flags();
     int in_v2_flags = in_v2.get_flags();
     int out_r_flags = out_r.get_flags();
@@ -277,6 +294,13 @@ py_axpby_inplace(sycl::queue q,
         throw std::runtime_error("Vectors must have the same length");
     }
 
+    if (q.get_context() != x.get_queue().get_context() ||
+        q.get_context() != y.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
     int x_flags = x.get_flags();
     int y_flags = y.get_flags();
 
@@ -373,6 +397,11 @@ py::object py_norm_squared_blocking(sycl::queue q,
         throw std::runtime_error("Vector must be contiguous.");
     }
 
+    if (q.get_context() != r.get_queue().get_context()) {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
     int r_typenum = r.get_typenum();
     if ((r_typenum != UAR_DOUBLE) && (r_typenum != UAR_FLOAT) &&
         (r_typenum != UAR_CDOUBLE) && (r_typenum != UAR_CFLOAT))
@@ -437,6 +466,13 @@ py::object py_dot_blocking(sycl::queue q,
         throw std::runtime_error("Vectors must be contiguous.");
     }
 
+    if (q.get_context() != v1.get_queue().get_context() ||
+        q.get_context() != v2.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
     int v1_typenum = v1.get_typenum();
     int v2_typenum = v2.get_typenum();
 
@@ -500,6 +536,80 @@ py::object py_dot_blocking(sycl::queue q,
     return res;
 }
 
+int py_cg_solve(sycl::queue exec_q,
+                dpctl::tensor::usm_ndarray Amat,
+                dpctl::tensor::usm_ndarray bvec,
+                dpctl::tensor::usm_ndarray xvec,
+                double rs_tol,
+                const std::vector<sycl::event> &depends = {})
+{
+    if (Amat.get_ndim() != 2 || bvec.get_ndim() != 1 || xvec.get_ndim() != 1) {
+        throw py::value_error("Expecting a matrix and two vectors");
+    }
+
+    py::ssize_t n0 = Amat.get_shape(0);
+    py::ssize_t n1 = Amat.get_shape(1);
+
+    if (n0 != n1) {
+        throw py::value_error("Matrix must be square.");
+    }
+
+    if (n0 != bvec.get_shape(0) || n0 != xvec.get_shape(0)) {
+        throw py::value_error(
+            "Dimensions of the matrix and vectors are not consistent.");
+    }
+
+    bool all_contig = (Amat.get_flags() & USM_ARRAY_C_CONTIGUOUS) &&
+                      (bvec.get_flags() & USM_ARRAY_C_CONTIGUOUS) &&
+                      (xvec.get_flags() & USM_ARRAY_C_CONTIGUOUS);
+    if (!all_contig) {
+        throw py::value_error("All inputs must be C-contiguous");
+    }
+
+    int A_typenum = Amat.get_typenum();
+    int b_typenum = bvec.get_typenum();
+    int x_typenum = xvec.get_typenum();
+
+    if (A_typenum != b_typenum || A_typenum != x_typenum) {
+        throw py::value_error("All arrays must have the same type");
+    }
+
+    if (exec_q.get_context() != Amat.get_queue().get_context() ||
+        exec_q.get_context() != bvec.get_queue().get_context() ||
+        exec_q.get_context() != xvec.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocations are not bound to context in execution queue");
+    }
+
+    const char *A_ch = Amat.get_data();
+    const char *b_ch = bvec.get_data();
+    char *x_ch = xvec.get_data();
+
+    if (A_typenum == UAR_DOUBLE) {
+        using T = double;
+        int iters = cg_solver::cg_solve<T>(
+            exec_q, n0, reinterpret_cast<const T *>(A_ch),
+            reinterpret_cast<const T *>(b_ch), reinterpret_cast<T *>(x_ch),
+            depends, static_cast<T>(rs_tol));
+
+        return iters;
+    }
+    else if (A_typenum == UAR_FLOAT) {
+        using T = float;
+        int iters = cg_solver::cg_solve<T>(
+            exec_q, n0, reinterpret_cast<const T *>(A_ch),
+            reinterpret_cast<const T *>(b_ch), reinterpret_cast<T *>(x_ch),
+            depends, static_cast<T>(rs_tol));
+
+        return iters;
+    }
+    else {
+        throw std::runtime_error(
+            "Unsupported data type. Use single or double precision.");
+    }
+}
+
 PYBIND11_MODULE(_onemkl, m)
 {
     // Import the dpctl extensions
@@ -518,4 +628,10 @@ PYBIND11_MODULE(_onemkl, m)
           py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list());
     m.def("dot_blocking", &py_dot_blocking, "<v1, v2>", py::arg("exec_queue"),
           py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list());
+
+    m.def("cpp_cg_solve", &py_cg_solve,
+          "Dispatch to call C++ implementation of cg_solve",
+          py::arg("exec_queue"), py::arg("Amat"), py::arg("bvec"),
+          py::arg("xvec"), py::arg("rs_squared_tolerance") = py::float_(1e-20),
+          py::arg("depends") = py::list());
 }
diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
index 048c9f00f7..42178ee0ed 100644
--- a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
+++ b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
@@ -74,3 +74,19 @@
 rs = sycl_gemm.norm_squared_blocking(q, delta)
 dpctl.SyclEvent.wait_for([hev, hev2])
 print(f"Python solution residual norm squared: {rs}")
+
+x_cpp = dpt.empty_like(b)
+iters = []
+for i in range(6):
+    with timer(api_dev.sycl_queue):
+        conv_in = sycl_gemm.cpp_cg_solve(q, A, b, x_cpp)
+
+    print(i, "(host_dt, device_dt)=", timer.dt)
+    iters.append(conv_in)
+
+print("Converged in: ", iters)
+hev, ev = sycl_gemm.gemv(q, A, x_cpp, r)
+hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev])
+rs = sycl_gemm.norm_squared_blocking(q, delta)
+dpctl.SyclEvent.wait_for([hev, hev2])
+print(f"cpp_cg_solve solution residual norm squared: {rs}")