From 70f684253ff02bb1f53442c7bc315c2973a75084 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 23 Apr 2022 22:20:54 -0500 Subject: [PATCH 01/11] onemkl_gemv example expanded: chebyshev example working --- examples/pybind11/onemkl_gemv/chebyshev.py | 128 ++++++++ .../onemkl_gemv/sycl_gemm/__init__.py | 4 +- .../onemkl_gemv/sycl_gemm/_onemkl.cpp | 305 +++++++++++++++++- 3 files changed, 434 insertions(+), 3 deletions(-) create mode 100644 examples/pybind11/onemkl_gemv/chebyshev.py diff --git a/examples/pybind11/onemkl_gemv/chebyshev.py b/examples/pybind11/onemkl_gemv/chebyshev.py new file mode 100644 index 0000000000..36cf4fbbf0 --- /dev/null +++ b/examples/pybind11/onemkl_gemv/chebyshev.py @@ -0,0 +1,128 @@ +import numpy as np +import sycl_gemm + +import dpctl +import dpctl.tensor as dpt + + +def empty_like(A): + return dpt.empty(A.shape, A.dtype, device=A.device) + + +def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]): + """Chebyshev iterative solver using SYCL routines""" + d = (lMax + lMin) / 2 + c = (lMax - lMin) / 2 + + x = dpt.copy(x0) + exec_queue = A.sycl_queue + assert exec_queue == x.sycl_queue + Ax = empty_like(A[:, 0]) + r = empty_like(Ax) + p = empty_like(Ax) + + e_x = dpctl.SyclEvent() + he_dot, e_dot = sycl_gemm.gemv( + exec_queue, A, x, Ax, depends=depends + ) # Ax = A @ x + he_sub, e_sub = sycl_gemm.sub( + exec_queue, b, Ax, r, depends=[e_dot] + ) # r = b - Ax + r_ev = e_sub + for i in range(nIters): + z = r + z_ev = r_ev + if i == 0: + p[:] = z + alpha = 1 / d + he_axbpy, e_axbpy = dpctl.SyclEvent(), dpctl.SyclEvent() + elif i == 1: + beta = 0.5 * (c * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace( + exec_queue, 1, z, beta, p, depends=[z_ev] + ) # p = z + beta * p + else: + beta = (c / 2 * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace( + exec_queue, 1, z, beta, p, depends=[z_ev] + ) # p = z + beta * p + h_x, e_x = sycl_gemm.axbpy_inplace( + exec_queue, alpha, p, 1, x, depends=[e_axbpy, e_x] + ) # x = x + alpha * p + he_dot, e_dot = sycl_gemm.gemv( + exec_queue, A, x, Ax, depends=[e_x] + ) # Ax = A @ x + he_sub, e_sub = sycl_gemm.sub( + exec_queue, b, Ax, r, depends=[e_dot] + ) # r = b - Ax + residual = sycl_gemm.norm_squared_blocking( + exec_queue, r, depends=[e_sub] + ) # residual = dot(r, r) + if residual <= 1e-29: + print(f"chebyshev: converged in {i} iters") + break + return x + + +def check_with_numpy(A, b): + """Direct solver using numpy""" + import numpy as np + + Anp = dpt.asnumpy(A) + bnp = dpt.asnumpy(b) + return np.linalg.solve(Anp, bnp) + + +def chebyshev_numpy(A, b, x0, nIters, lMax, lMin): + """Chebyshev iterative solver using numpy""" + d = (lMax + lMin) / 2 + c = (lMax - lMin) / 2 + + x = x0 + + Ax = A @ x + r = b - Ax + for i in range(nIters): + z = r + if i == 0: + p = z + alpha = 1 / d + elif i == 1: + beta = 0.5 * (c * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + p = z + beta * p + else: + beta = (c / 2 * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + p = z + beta * p + x += alpha * p + Ax = A @ x + r = b - Ax + residual = np.dot(r, r) + if residual <= 1e-29: + print(f"chebyshev_numpy: converged in {i} iters") + break + return x + + +if __name__ == "__main__": + A = dpt.asarray( + [[2, 1, 0, 0], [1, 2, 1, 0], [0, 1, 2, 1], [0, 0, 1, 2]], + dtype="d", + usm_type="device", + ) + dev = A.device + b = dpt.asarray( + [0.5, 0.7, 0.3, 0.1], dtype="d", usm_type="device", device=dev + ) + x0 = b + x = chebyshev(A, b, x0, 100, 4.0, 0.25) + print(dpt.asnumpy(x)) + print(check_with_numpy(A, b)) + print( + chebyshev_numpy( + dpt.asnumpy(A), dpt.asnumpy(b), dpt.asnumpy(b), 100, 4.0, 0.25 + ) + ) diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py index 07d7ec6ced..df8b5129fa 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py @@ -1,3 +1,3 @@ -from ._onemkl import gemv +from ._onemkl import axbpy_inplace, gemv, norm_squared_blocking, sub -__all__ = ["gemv"] +__all__ = ["gemv", "sub", "axbpy_inplace", "norm_squared_blocking"] diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 569428004f..1fb7a951d6 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "dpctl4pybind11.hpp" // clang-format on @@ -110,9 +111,311 @@ gemv(sycl::queue q, return std::make_pair(ht_event, res_ev); } +template class sub_kern; + +template +sycl::event sub_impl(sycl::queue q, + size_t n, + const char *v1_i, + const char *v2_i, + char *r_i, + const std::vector &depends = {}) +{ + const T *v1 = reinterpret_cast(v1_i); + const T *v2 = reinterpret_cast(v2_i); + T *r = reinterpret_cast(r_i); + + sycl::event r_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>(sycl::range<1>{n}, [=](sycl::id<1> id) { + auto i = id.get(0); + r[i] = v1[i] - v2[i]; + }); + }); + + return r_ev; +} + +// out_r = in_v1 - in_v2 +std::pair +sub(sycl::queue q, + dpctl::tensor::usm_ndarray in_v1, + dpctl::tensor::usm_ndarray in_v2, + dpctl::tensor::usm_ndarray out_r, + const std::vector &depends = {}) +{ + if (in_v1.get_ndim() != 1 || in_v2.get_ndim() != 1 || out_r.get_ndim() != 1) + { + throw std::runtime_error("Inconsistent dimensions, expecting vectors"); + } + + py::ssize_t n = in_v1.get_shape(0); // get length of the vector + + if (n != in_v2.get_shape(0) || n != out_r.get_shape(0)) { + throw std::runtime_error("Vectors must have the same length"); + } + + int in_v1_flags = in_v1.get_flags(); + int in_v2_flags = in_v2.get_flags(); + int out_r_flags = out_r.get_flags(); + + if (!((in_v1_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) && + (in_v2_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) && + (out_r_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)))) + { + throw std::runtime_error("Vectors must be contiguous."); + } + + int in_v1_typenum = in_v1.get_typenum(); + int in_v2_typenum = in_v2.get_typenum(); + int out_r_typenum = out_r.get_typenum(); + + if ((in_v2_typenum != in_v1_typenum) || (out_r_typenum != in_v1_typenum) || + !((in_v1_typenum == UAR_DOUBLE) || (in_v1_typenum == UAR_FLOAT) || + (in_v1_typenum == UAR_CDOUBLE) || (in_v1_typenum == UAR_CFLOAT))) + { + throw std::runtime_error( + "Only real and complex floating point arrays are supported."); + } + + const char *in_v1_typeless_ptr = in_v1.get_data(); + const char *in_v2_typeless_ptr = in_v2.get_data(); + char *out_r_typeless_ptr = out_r.get_data(); + + sycl::event res_ev; + if (out_r_typenum == UAR_DOUBLE) { + using T = double; + res_ev = sub_impl(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr, + out_r_typeless_ptr, depends); + } + else if (out_r_typenum == UAR_FLOAT) { + using T = float; + res_ev = sub_impl(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr, + out_r_typeless_ptr, depends); + } + else if (out_r_typenum == UAR_CDOUBLE) { + using T = std::complex; + res_ev = sub_impl(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr, + out_r_typeless_ptr, depends); + } + else if (out_r_typenum == UAR_CFLOAT) { + using T = std::complex; + res_ev = sub_impl(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr, + out_r_typeless_ptr, depends); + } + else { + throw std::runtime_error("Type dispatch ran into trouble."); + } + + sycl::event ht_event = keep_args_alive(q, {in_v1, in_v2, out_r}, {res_ev}); + + return std::make_pair(ht_event, res_ev); +} + +template class axbpy_inplace_kern; + +template +sycl::event axbpy_inplace_impl(sycl::queue q, + size_t nelems, + py::object pyobj_a, + const char *x_typeless, + py::object pyobj_b, + char *y_typeless, + const std::vector depends = {}) +{ + T a = py::cast(pyobj_a); + T b = py::cast(pyobj_b); + + const T *x = reinterpret_cast(x_typeless); + T *y = reinterpret_cast(y_typeless); + + sycl::event res_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>(sycl::range<1>{nelems}, + [=](sycl::id<1> id) { + auto i = id.get(0); + y[i] = a * x[i] + b * y[i]; + }); + }); + + return res_ev; +} + +// y = a * x + b * y +std::pair +axbpy_inplace(sycl::queue q, + py::object a, + dpctl::tensor::usm_ndarray x, + py::object b, + dpctl::tensor::usm_ndarray y, + const std::vector &depends = {}) +{ + + if (x.get_ndim() != 1 || y.get_ndim() != 1) { + throw std::runtime_error("Inconsistent dimensions, expecting vectors"); + } + + py::ssize_t n = x.get_shape(0); // get length of the vector + + if (n != y.get_shape(0)) { + throw std::runtime_error("Vectors must have the same length"); + } + + int x_flags = x.get_flags(); + int y_flags = y.get_flags(); + + if (!((x_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) && + (y_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)))) + { + throw std::runtime_error("Vectors must be contiguous."); + } + + int x_typenum = x.get_typenum(); + int y_typenum = y.get_typenum(); + + if ((x_typenum != y_typenum) || + !((x_typenum == UAR_DOUBLE) || (x_typenum == UAR_FLOAT) || + (x_typenum == UAR_CDOUBLE) || (x_typenum == UAR_CFLOAT))) + { + throw std::runtime_error( + "Only real and complex floating point arrays are supported."); + } + + const char *x_typeless_ptr = x.get_data(); + char *y_typeless_ptr = y.get_data(); + + sycl::event res_ev; + if (x_typenum == UAR_DOUBLE) { + using T = double; + res_ev = axbpy_inplace_impl(q, n, a, x_typeless_ptr, b, + y_typeless_ptr, depends); + } + else if (x_typenum == UAR_FLOAT) { + using T = float; + res_ev = axbpy_inplace_impl(q, n, a, x_typeless_ptr, b, + y_typeless_ptr, depends); + } + else if (x_typenum == UAR_CDOUBLE) { + using T = std::complex; + res_ev = axbpy_inplace_impl(q, n, a, x_typeless_ptr, b, + y_typeless_ptr, depends); + } + else if (x_typenum == UAR_CFLOAT) { + using T = std::complex; + res_ev = axbpy_inplace_impl(q, n, a, x_typeless_ptr, b, + y_typeless_ptr, depends); + } + else { + throw std::runtime_error("Type dispatch ran into trouble."); + } + + sycl::event ht_event = keep_args_alive(q, {x, y}, {res_ev}); + + return std::make_pair(ht_event, res_ev); +} + +template class norm_squared_blocking_kern; + +template +T norm_squared_blocking_impl(sycl::queue q, + size_t nelems, + const char *r_typeless, + const std::vector &depends = {}) +{ + const T *r = reinterpret_cast(r_typeless); + + sycl::buffer sum_sq_buf(sycl::range<1>{1}); + + q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + auto sum_sq_reduction = sycl::reduction( + sum_sq_buf, cgh, sycl::plus(), + {sycl::property::reduction::initialize_to_identity{}}); + cgh.parallel_for>( + sycl::range<1>{nelems}, sum_sq_reduction, + [=](sycl::id<1> id, auto &sum_sq) { + auto i = id.get(0); + sum_sq += r[i] * r[i]; + }); + }).wait_and_throw(); + + sycl::host_accessor ha(sum_sq_buf); + return ha[0]; +} + +py::object norm_squared_blocking(sycl::queue q, + dpctl::tensor::usm_ndarray r, + const std::vector depends = {}) +{ + if (r.get_ndim() != 1) { + throw std::runtime_error("Expecting a vector"); + } + + py::ssize_t n = r.get_shape(0); // get length of the vector + + int r_flags = r.get_flags(); + + if (!(r_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))) { + throw std::runtime_error("Vector must be contiguous."); + } + + int r_typenum = r.get_typenum(); + if ((r_typenum != UAR_DOUBLE) && (r_typenum != UAR_FLOAT) && + (r_typenum != UAR_CDOUBLE) && (r_typenum != UAR_CFLOAT)) + { + throw std::runtime_error( + "Only real and complex floating point arrays are supported."); + } + + const char *r_typeless_ptr = r.get_data(); + py::object res; + + if (r_typenum == UAR_DOUBLE) { + using T = double; + T n_sq = norm_squared_blocking_impl(q, n, r_typeless_ptr, depends); + res = py::float_(n_sq); + } + else if (r_typenum == UAR_FLOAT) { + using T = float; + T n_sq = norm_squared_blocking_impl(q, n, r_typeless_ptr, depends); + res = py::float_(n_sq); + } +#if 0 + else if (r_typenum == UAR_CDOUBLE) { + using T = std::complex; + double n_sq = norm_squared_blocking_impl( + q, n, r_typeless_ptr, + depends); + res = py::float_(n_sq); + } + else if (r_typenum == UAR_CFLOAT) { + using T = std::complex; + float n_sq = norm_squared_blocking_impl( + q, n, r_typeless_ptr, + depends); + res = py::float_(n_sq); + } +#endif + else { + throw std::runtime_error("Type dispatch ran into trouble."); + } + + return res; +} + PYBIND11_MODULE(_onemkl, m) { // Import the dpctl extensions import_dpctl(); - m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)"); + m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)", + py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"), + py::arg("resvec"), py::arg("depends") = py::list()); + m.def("sub", &sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"), + py::arg("in1"), py::arg("in2"), py::arg("out"), + py::arg("depends") = py::list()); + m.def("axbpy_inplace", &axbpy_inplace, "y = a * x + b * y", + py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"), + py::arg("y"), py::arg("depends") = py::list()); + m.def("norm_squared_blocking", &norm_squared_blocking, "norm(r)**2", + py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list()); } From c4ad438086eb553265581e522c04fb57171cbd01 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 25 Apr 2022 10:25:50 -0500 Subject: [PATCH 02/11] Implemented conjugate gradient linear solver --- examples/pybind11/onemkl_gemv/chebyshev.py | 128 ---------- examples/pybind11/onemkl_gemv/solve.py | 229 ++++++++++++++++++ .../onemkl_gemv/sycl_gemm/__init__.py | 16 +- .../onemkl_gemv/sycl_gemm/_onemkl.cpp | 137 ++++++++++- 4 files changed, 370 insertions(+), 140 deletions(-) delete mode 100644 examples/pybind11/onemkl_gemv/chebyshev.py create mode 100644 examples/pybind11/onemkl_gemv/solve.py diff --git a/examples/pybind11/onemkl_gemv/chebyshev.py b/examples/pybind11/onemkl_gemv/chebyshev.py deleted file mode 100644 index 36cf4fbbf0..0000000000 --- a/examples/pybind11/onemkl_gemv/chebyshev.py +++ /dev/null @@ -1,128 +0,0 @@ -import numpy as np -import sycl_gemm - -import dpctl -import dpctl.tensor as dpt - - -def empty_like(A): - return dpt.empty(A.shape, A.dtype, device=A.device) - - -def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]): - """Chebyshev iterative solver using SYCL routines""" - d = (lMax + lMin) / 2 - c = (lMax - lMin) / 2 - - x = dpt.copy(x0) - exec_queue = A.sycl_queue - assert exec_queue == x.sycl_queue - Ax = empty_like(A[:, 0]) - r = empty_like(Ax) - p = empty_like(Ax) - - e_x = dpctl.SyclEvent() - he_dot, e_dot = sycl_gemm.gemv( - exec_queue, A, x, Ax, depends=depends - ) # Ax = A @ x - he_sub, e_sub = sycl_gemm.sub( - exec_queue, b, Ax, r, depends=[e_dot] - ) # r = b - Ax - r_ev = e_sub - for i in range(nIters): - z = r - z_ev = r_ev - if i == 0: - p[:] = z - alpha = 1 / d - he_axbpy, e_axbpy = dpctl.SyclEvent(), dpctl.SyclEvent() - elif i == 1: - beta = 0.5 * (c * alpha) ** 2 - alpha = 1 / (d - beta / alpha) - he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace( - exec_queue, 1, z, beta, p, depends=[z_ev] - ) # p = z + beta * p - else: - beta = (c / 2 * alpha) ** 2 - alpha = 1 / (d - beta / alpha) - he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace( - exec_queue, 1, z, beta, p, depends=[z_ev] - ) # p = z + beta * p - h_x, e_x = sycl_gemm.axbpy_inplace( - exec_queue, alpha, p, 1, x, depends=[e_axbpy, e_x] - ) # x = x + alpha * p - he_dot, e_dot = sycl_gemm.gemv( - exec_queue, A, x, Ax, depends=[e_x] - ) # Ax = A @ x - he_sub, e_sub = sycl_gemm.sub( - exec_queue, b, Ax, r, depends=[e_dot] - ) # r = b - Ax - residual = sycl_gemm.norm_squared_blocking( - exec_queue, r, depends=[e_sub] - ) # residual = dot(r, r) - if residual <= 1e-29: - print(f"chebyshev: converged in {i} iters") - break - return x - - -def check_with_numpy(A, b): - """Direct solver using numpy""" - import numpy as np - - Anp = dpt.asnumpy(A) - bnp = dpt.asnumpy(b) - return np.linalg.solve(Anp, bnp) - - -def chebyshev_numpy(A, b, x0, nIters, lMax, lMin): - """Chebyshev iterative solver using numpy""" - d = (lMax + lMin) / 2 - c = (lMax - lMin) / 2 - - x = x0 - - Ax = A @ x - r = b - Ax - for i in range(nIters): - z = r - if i == 0: - p = z - alpha = 1 / d - elif i == 1: - beta = 0.5 * (c * alpha) ** 2 - alpha = 1 / (d - beta / alpha) - p = z + beta * p - else: - beta = (c / 2 * alpha) ** 2 - alpha = 1 / (d - beta / alpha) - p = z + beta * p - x += alpha * p - Ax = A @ x - r = b - Ax - residual = np.dot(r, r) - if residual <= 1e-29: - print(f"chebyshev_numpy: converged in {i} iters") - break - return x - - -if __name__ == "__main__": - A = dpt.asarray( - [[2, 1, 0, 0], [1, 2, 1, 0], [0, 1, 2, 1], [0, 0, 1, 2]], - dtype="d", - usm_type="device", - ) - dev = A.device - b = dpt.asarray( - [0.5, 0.7, 0.3, 0.1], dtype="d", usm_type="device", device=dev - ) - x0 = b - x = chebyshev(A, b, x0, 100, 4.0, 0.25) - print(dpt.asnumpy(x)) - print(check_with_numpy(A, b)) - print( - chebyshev_numpy( - dpt.asnumpy(A), dpt.asnumpy(b), dpt.asnumpy(b), 100, 4.0, 0.25 - ) - ) diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py new file mode 100644 index 0000000000..1340556770 --- /dev/null +++ b/examples/pybind11/onemkl_gemv/solve.py @@ -0,0 +1,229 @@ +import numpy as np +import sycl_gemm + +import dpctl +import dpctl.tensor as dpt + + +def empty_like(A): + return dpt.empty(A.shape, A.dtype, device=A.device) + + +def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]): + """Chebyshev iterative solver using SYCL routines""" + d = (lMax + lMin) / 2 + c = (lMax - lMin) / 2 + + x = dpt.copy(x0) + exec_queue = A.sycl_queue + assert exec_queue == x.sycl_queue + Ax = empty_like(A[:, 0]) + r = empty_like(Ax) + p = empty_like(Ax) + + e_x = dpctl.SyclEvent() + he_dot, e_dot = sycl_gemm.gemv( + exec_queue, A, x, Ax, depends=depends + ) # Ax = A @ x + he_sub, e_sub = sycl_gemm.sub( + exec_queue, b, Ax, r, depends=[e_dot] + ) # r = b - Ax + r_ev = e_sub + for i in range(nIters): + z = r + z_ev = r_ev + if i == 0: + p[:] = z + alpha = 1 / d + he_axbpy, e_axbpy = dpctl.SyclEvent(), dpctl.SyclEvent() + elif i == 1: + beta = 0.5 * (c * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace( + exec_queue, 1, z, beta, p, depends=[z_ev] + ) # p = z + beta * p + else: + beta = (c / 2 * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace( + exec_queue, 1, z, beta, p, depends=[z_ev] + ) # p = z + beta * p + h_x, e_x = sycl_gemm.axbpy_inplace( + exec_queue, alpha, p, 1, x, depends=[e_axbpy, e_x] + ) # x = x + alpha * p + he_dot, e_dot = sycl_gemm.gemv( + exec_queue, A, x, Ax, depends=[e_x] + ) # Ax = A @ x + he_sub, e_sub = sycl_gemm.sub( + exec_queue, b, Ax, r, depends=[e_dot] + ) # r = b - Ax + residual = sycl_gemm.norm_squared_blocking( + exec_queue, r, depends=[e_sub] + ) # residual = dot(r, r) + if residual <= 1e-29: + print(f"chebyshev: converged in {i} iters") + break + exec_queue.wait() # wait for all host tasks to complete + return x + + +def check_with_numpy(A, b): + """Direct solver using numpy""" + import numpy as np + + return np.linalg.solve(Anp, bnp) + + +def chebyshev_numpy(A, b, x0, nIters, lMax, lMin): + """Chebyshev iterative solver using numpy""" + d = (lMax + lMin) / 2 + c = (lMax - lMin) / 2 + + x = x0 + + Ax = np.dot(A, x) + r = b - Ax + for i in range(nIters): + z = r + if i == 0: + p = z + alpha = 1 / d + elif i == 1: + beta = 0.5 * (c * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + p = z + beta * p + else: + beta = (c / 2 * alpha) ** 2 + alpha = 1 / (d - beta / alpha) + p = z + beta * p + x = x + alpha * p + Ax = np.dot(A, x) + r = b - Ax + residual = np.dot(r, r) + if residual <= 1e-29: + print(f"chebyshev_numpy: converged in {i} iters") + break + return x + + +def cg_solve(A, b): + """ + Conjugate gradient solver for A @ x == b. + + Returns tuple: (x, converged) + + converged is False if solver has not converged, or the iteration number + """ + exec_queue = A.sycl_queue + x = dpt.zeros(b.shape, dtype=b.dtype) + Ap = empty_like(x) + + all_host_tasks = [] + r = dpt.copy(b) + p = dpt.copy(b) + rsold = sycl_gemm.norm_squared_blocking(exec_queue, r) + if rsold < 1e-20: + return (b, 0) + converged = False + max_iters = b.shape[0] + + e_p = dpctl.SyclEvent() + e_x = dpctl.SyclEvent() + for i in range(max_iters): + he_dot, e_dot = sycl_gemm.gemv( + exec_queue, A, p, Ap, depends=[e_p] + ) # Ap = A @ p + all_host_tasks.append(he_dot) + alpha = rsold / sycl_gemm.dot_blocking( # alpha = rsold / dot(p, Ap) + exec_queue, p, Ap, depends=[e_dot] + ) + he1_axbpy, e1_axbpy = sycl_gemm.axbpy_inplace( + exec_queue, alpha, p, 1, x, depends=[e_p, e_x] + ) # x = x + alpha * p + all_host_tasks.append(he1_axbpy) + e_x = e1_axbpy + + he2_axbpy, e2_axbpy = sycl_gemm.axbpy_inplace( + exec_queue, -alpha, Ap, 1, r, depends=[e_p] + ) # r = r - alpha * Ap + all_host_tasks.append(he2_axbpy) + + rsnew = sycl_gemm.norm_squared_blocking( + exec_queue, r, depends=[e2_axbpy] + ) + if rsnew < 1e-20: + e1_axbpy.wait() + converged = i + break + beta = rsnew / rsold + + he3_axbpy, e3_axbpy = sycl_gemm.axbpy_inplace( + exec_queue, 1, r, beta, p, depends=[e1_axbpy, e2_axbpy] + ) # p = r + beta * p + + rsold = rsnew + all_host_tasks.append(he3_axbpy) + e_p = e3_axbpy + + dpctl.SyclEvent.wait_for(all_host_tasks) + return x, converged + + +def cg_solve_numpy(A, b): + x = np.zeros_like(b) + r = b - np.dot(A, x) + p = r + rsold = np.dot(r, r) + converged = False + max_iters = b.shape[0] + + for i in range(max_iters): + Ap = np.dot(A, p) + alpha = rsold / np.dot(p, Ap) + x = x + alpha * p + r = r - alpha * Ap + rsnew = np.dot(r, r) + + if rsnew < 1e-20: + converged = i + break + + beta = rsnew / rsold + p = r + beta * p + rsold = rsnew + + return (x, converged) + + +if __name__ == "__main__": + n = 32 + lambda_max = 4 + lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2)))) + # eigenvalues of cartan matrix are + Anp = ( + 2 * np.eye(n, n, k=0, dtype="d") + + np.eye(n, n, k=1, dtype="d") + + np.eye(n, n, k=-1, dtype="d") + ) + bnp = np.geomspace(0.5, 2, n, dtype="d") + + q = dpctl.SyclQueue(property="enable_profiling") + A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q) + dev = A.device + b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev) + x0 = b + t = dpctl.SyclTimer() + with t(dev.sycl_queue): + x, conv = cg_solve(A, b) + print((conv, t.dt)) + with t(dev.sycl_queue): + x, conv = cg_solve(A, b) + print(dpt.asnumpy(x)) + print((conv, t.dt)) + with t(dev.sycl_queue): + print(check_with_numpy(Anp, bnp)) + print(t.dt) + with t(dev.sycl_queue): + x_np, conv = cg_solve_numpy(dpt.asnumpy(A), dpt.asnumpy(b)) + print(x_np, conv) + print(t.dt) diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py index df8b5129fa..14a8c0dec8 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py @@ -1,3 +1,15 @@ -from ._onemkl import axbpy_inplace, gemv, norm_squared_blocking, sub +from ._onemkl import ( + axbpy_inplace, + dot_blocking, + gemv, + norm_squared_blocking, + sub, +) -__all__ = ["gemv", "sub", "axbpy_inplace", "norm_squared_blocking"] +__all__ = [ + "gemv", + "sub", + "axbpy_inplace", + "norm_squared_blocking", + "dot_blocking", +] diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 1fb7a951d6..05590565e2 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -343,6 +343,38 @@ T norm_squared_blocking_impl(sycl::queue q, return ha[0]; } +template class complex_norm_squared_blocking_kern; + +template +T complex_norm_squared_blocking_impl( + sycl::queue q, + size_t nelems, + const char *r_typeless, + const std::vector &depends = {}) +{ + const std::complex *r = + reinterpret_cast *>(r_typeless); + + sycl::buffer sum_sq_buf(sycl::range<1>{1}); + + q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + auto sum_sq_reduction = sycl::reduction( + sum_sq_buf, cgh, sycl::plus(), + {sycl::property::reduction::initialize_to_identity{}}); + cgh.parallel_for>( + sycl::range<1>{nelems}, sum_sq_reduction, + [=](sycl::id<1> id, auto &sum_sq) { + auto i = id.get(0); + sum_sq += + r[i].real() * r[i].real() + r[i].imag() * r[i].imag(); + }); + }).wait_and_throw(); + + sycl::host_accessor ha(sum_sq_buf); + return ha[0]; +} + py::object norm_squared_blocking(sycl::queue q, dpctl::tensor::usm_ndarray r, const std::vector depends = {}) @@ -380,22 +412,105 @@ py::object norm_squared_blocking(sycl::queue q, T n_sq = norm_squared_blocking_impl(q, n, r_typeless_ptr, depends); res = py::float_(n_sq); } -#if 0 else if (r_typenum == UAR_CDOUBLE) { using T = std::complex; - double n_sq = norm_squared_blocking_impl( - q, n, r_typeless_ptr, - depends); - res = py::float_(n_sq); + double n_sq = complex_norm_squared_blocking_impl( + q, n, r_typeless_ptr, depends); + res = py::float_(n_sq); } else if (r_typenum == UAR_CFLOAT) { using T = std::complex; - float n_sq = norm_squared_blocking_impl( - q, n, r_typeless_ptr, - depends); - res = py::float_(n_sq); + float n_sq = complex_norm_squared_blocking_impl( + q, n, r_typeless_ptr, depends); + res = py::float_(n_sq); + } + else { + throw std::runtime_error("Type dispatch ran into trouble."); + } + + return res; +} + +py::object dot_blocking(sycl::queue q, + dpctl::tensor::usm_ndarray v1, + dpctl::tensor::usm_ndarray v2, + const std::vector &depends = {}) +{ + if (v1.get_ndim() != 1 || v2.get_ndim() != 1) { + throw std::runtime_error("Expecting two vectors"); + } + + py::ssize_t n = v1.get_shape(0); // get length of the vector + + if (n != v2.get_shape(0)) { + throw std::runtime_error("Length of vectors are not the same"); + } + + int v1_flags = v1.get_flags(); + int v2_flags = v2.get_flags(); + + if (!(v1_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)) || + !(v2_flags & (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS))) + { + throw std::runtime_error("Vectors must be contiguous."); + } + + int v1_typenum = v1.get_typenum(); + int v2_typenum = v2.get_typenum(); + + if ((v1_typenum != v2_typenum) || + ((v1_typenum != UAR_DOUBLE) && (v1_typenum != UAR_FLOAT) && + (v1_typenum != UAR_CDOUBLE) && (v1_typenum != UAR_CFLOAT))) + { + throw py::value_error( + "Data types of vectors must be the same. " + "Only real and complex floating types are supported."); + } + + const char *v1_typeless_ptr = v1.get_data(); + const char *v2_typeless_ptr = v2.get_data(); + py::object res; + + if (v1_typenum == UAR_DOUBLE) { + using T = double; + T *res_usm = sycl::malloc_device(1, q); + sycl::event dot_ev = oneapi::mkl::blas::row_major::dot( + q, n, reinterpret_cast(v1_typeless_ptr), 1, + reinterpret_cast(v2_typeless_ptr), 1, res_usm, depends); + T res_v{}; + q.copy(res_usm, &res_v, 1, {dot_ev}).wait_and_throw(); + res = py::float_(res_v); + } + else if (v1_typenum == UAR_FLOAT) { + using T = float; + T *res_usm = sycl::malloc_device(1, q); + sycl::event dot_ev = oneapi::mkl::blas::row_major::dot( + q, n, reinterpret_cast(v1_typeless_ptr), 1, + reinterpret_cast(v2_typeless_ptr), 1, res_usm, depends); + T res_v(0); + q.copy(res_usm, &res_v, 1, {dot_ev}).wait_and_throw(); + res = py::float_(res_v); + } + else if (v1_typenum == UAR_CDOUBLE) { + using T = std::complex; + T *res_usm = sycl::malloc_device(1, q); + sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc( + q, n, reinterpret_cast(v1_typeless_ptr), 1, + reinterpret_cast(v2_typeless_ptr), 1, res_usm, depends); + T res_v{}; + q.copy(res_usm, &res_v, 1, {dotc_ev}).wait_and_throw(); + res = py::cast(res_v); + } + else if (v1_typenum == UAR_CFLOAT) { + using T = std::complex; + T *res_usm = sycl::malloc_device(1, q); + sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc( + q, n, reinterpret_cast(v1_typeless_ptr), 1, + reinterpret_cast(v2_typeless_ptr), 1, res_usm, depends); + T res_v{}; + q.copy(res_usm, &res_v, 1, {dotc_ev}).wait_and_throw(); + res = py::cast(res_v); } -#endif else { throw std::runtime_error("Type dispatch ran into trouble."); } @@ -418,4 +533,6 @@ PYBIND11_MODULE(_onemkl, m) py::arg("y"), py::arg("depends") = py::list()); m.def("norm_squared_blocking", &norm_squared_blocking, "norm(r)**2", py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list()); + m.def("dot_blocking", &dot_blocking, "", py::arg("exec_queue"), + py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list()); } From 49cee5cccb0d2f1d2050edaea260d60ceddd90e2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 25 Apr 2022 10:51:20 -0500 Subject: [PATCH 03/11] Renamed function: axpby for a*x plus b*y. Added tests for each function. --- examples/pybind11/onemkl_gemv/solve.py | 32 +++++----- .../onemkl_gemv/sycl_gemm/__init__.py | 4 +- .../onemkl_gemv/sycl_gemm/_onemkl.cpp | 18 +++--- .../pybind11/onemkl_gemv/tests/test_gemm.py | 62 ++++++++++++++++++- 4 files changed, 87 insertions(+), 29 deletions(-) diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py index 1340556770..b12681ab35 100644 --- a/examples/pybind11/onemkl_gemv/solve.py +++ b/examples/pybind11/onemkl_gemv/solve.py @@ -35,21 +35,21 @@ def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]): if i == 0: p[:] = z alpha = 1 / d - he_axbpy, e_axbpy = dpctl.SyclEvent(), dpctl.SyclEvent() + he_axpby, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent() elif i == 1: beta = 0.5 * (c * alpha) ** 2 alpha = 1 / (d - beta / alpha) - he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace( + he_axpby, e_axpby = sycl_gemm.axpby_inplace( exec_queue, 1, z, beta, p, depends=[z_ev] ) # p = z + beta * p else: beta = (c / 2 * alpha) ** 2 alpha = 1 / (d - beta / alpha) - he_axbpy, e_axbpy = sycl_gemm.axbpy_inplace( + he_axpby, e_axpby = sycl_gemm.axpby_inplace( exec_queue, 1, z, beta, p, depends=[z_ev] ) # p = z + beta * p - h_x, e_x = sycl_gemm.axbpy_inplace( - exec_queue, alpha, p, 1, x, depends=[e_axbpy, e_x] + h_x, e_x = sycl_gemm.axpby_inplace( + exec_queue, alpha, p, 1, x, depends=[e_axpby, e_x] ) # x = x + alpha * p he_dot, e_dot = sycl_gemm.gemv( exec_queue, A, x, Ax, depends=[e_x] @@ -137,33 +137,33 @@ def cg_solve(A, b): alpha = rsold / sycl_gemm.dot_blocking( # alpha = rsold / dot(p, Ap) exec_queue, p, Ap, depends=[e_dot] ) - he1_axbpy, e1_axbpy = sycl_gemm.axbpy_inplace( + he1_axpby, e1_axpby = sycl_gemm.axpby_inplace( exec_queue, alpha, p, 1, x, depends=[e_p, e_x] ) # x = x + alpha * p - all_host_tasks.append(he1_axbpy) - e_x = e1_axbpy + all_host_tasks.append(he1_axpby) + e_x = e1_axpby - he2_axbpy, e2_axbpy = sycl_gemm.axbpy_inplace( + he2_axpby, e2_axpby = sycl_gemm.axpby_inplace( exec_queue, -alpha, Ap, 1, r, depends=[e_p] ) # r = r - alpha * Ap - all_host_tasks.append(he2_axbpy) + all_host_tasks.append(he2_axpby) rsnew = sycl_gemm.norm_squared_blocking( - exec_queue, r, depends=[e2_axbpy] + exec_queue, r, depends=[e2_axpby] ) if rsnew < 1e-20: - e1_axbpy.wait() + e1_axpby.wait() converged = i break beta = rsnew / rsold - he3_axbpy, e3_axbpy = sycl_gemm.axbpy_inplace( - exec_queue, 1, r, beta, p, depends=[e1_axbpy, e2_axbpy] + he3_axpby, e3_axpby = sycl_gemm.axpby_inplace( + exec_queue, 1, r, beta, p, depends=[e1_axpby, e2_axpby] ) # p = r + beta * p rsold = rsnew - all_host_tasks.append(he3_axbpy) - e_p = e3_axbpy + all_host_tasks.append(he3_axpby) + e_p = e3_axpby dpctl.SyclEvent.wait_for(all_host_tasks) return x, converged diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py index 14a8c0dec8..45af05e21b 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py @@ -1,5 +1,5 @@ from ._onemkl import ( - axbpy_inplace, + axpby_inplace, dot_blocking, gemv, norm_squared_blocking, @@ -9,7 +9,7 @@ __all__ = [ "gemv", "sub", - "axbpy_inplace", + "axpby_inplace", "norm_squared_blocking", "dot_blocking", ] diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 05590565e2..84d33bc91e 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -212,10 +212,10 @@ sub(sycl::queue q, return std::make_pair(ht_event, res_ev); } -template class axbpy_inplace_kern; +template class axpby_inplace_kern; template -sycl::event axbpy_inplace_impl(sycl::queue q, +sycl::event axpby_inplace_impl(sycl::queue q, size_t nelems, py::object pyobj_a, const char *x_typeless, @@ -231,7 +231,7 @@ sycl::event axbpy_inplace_impl(sycl::queue q, sycl::event res_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - cgh.parallel_for>(sycl::range<1>{nelems}, + cgh.parallel_for>(sycl::range<1>{nelems}, [=](sycl::id<1> id) { auto i = id.get(0); y[i] = a * x[i] + b * y[i]; @@ -243,7 +243,7 @@ sycl::event axbpy_inplace_impl(sycl::queue q, // y = a * x + b * y std::pair -axbpy_inplace(sycl::queue q, +axpby_inplace(sycl::queue q, py::object a, dpctl::tensor::usm_ndarray x, py::object b, @@ -287,22 +287,22 @@ axbpy_inplace(sycl::queue q, sycl::event res_ev; if (x_typenum == UAR_DOUBLE) { using T = double; - res_ev = axbpy_inplace_impl(q, n, a, x_typeless_ptr, b, + res_ev = axpby_inplace_impl(q, n, a, x_typeless_ptr, b, y_typeless_ptr, depends); } else if (x_typenum == UAR_FLOAT) { using T = float; - res_ev = axbpy_inplace_impl(q, n, a, x_typeless_ptr, b, + res_ev = axpby_inplace_impl(q, n, a, x_typeless_ptr, b, y_typeless_ptr, depends); } else if (x_typenum == UAR_CDOUBLE) { using T = std::complex; - res_ev = axbpy_inplace_impl(q, n, a, x_typeless_ptr, b, + res_ev = axpby_inplace_impl(q, n, a, x_typeless_ptr, b, y_typeless_ptr, depends); } else if (x_typenum == UAR_CFLOAT) { using T = std::complex; - res_ev = axbpy_inplace_impl(q, n, a, x_typeless_ptr, b, + res_ev = axpby_inplace_impl(q, n, a, x_typeless_ptr, b, y_typeless_ptr, depends); } else { @@ -528,7 +528,7 @@ PYBIND11_MODULE(_onemkl, m) m.def("sub", &sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"), py::arg("in1"), py::arg("in2"), py::arg("out"), py::arg("depends") = py::list()); - m.def("axbpy_inplace", &axbpy_inplace, "y = a * x + b * y", + m.def("axpby_inplace", &axpby_inplace, "y = a * x + b * y", py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"), py::arg("y"), py::arg("depends") = py::list()); m.def("norm_squared_blocking", &norm_squared_blocking, "norm(r)**2", diff --git a/examples/pybind11/onemkl_gemv/tests/test_gemm.py b/examples/pybind11/onemkl_gemv/tests/test_gemm.py index 7a78fba346..6090322c75 100644 --- a/examples/pybind11/onemkl_gemv/tests/test_gemm.py +++ b/examples/pybind11/onemkl_gemv/tests/test_gemm.py @@ -1,6 +1,12 @@ import numpy as np import pytest -from sycl_gemm import gemv +from sycl_gemm import ( + axpby_inplace, + dot_blocking, + gemv, + norm_squared_blocking, + sub, +) import dpctl import dpctl.tensor as dpt @@ -15,7 +21,59 @@ def test_gemv(): r = dpt.empty((5,), dtype="d", sycl_queue=q) M = dpt.asarray(Mnp, sycl_queue=q) v = dpt.asarray(vnp, sycl_queue=q) - hev, ev = gemv(M.sycl_queue, M, v, r, []) + hev, ev = gemv(q, M, v, r, []) hev.wait() rnp = dpt.asnumpy(r) assert np.allclose(rnp, Mnp @ vnp) + + +def test_sub(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + anp, bnp = np.random.randn(5), np.random.randn(5) + r = dpt.empty((5,), dtype="d", sycl_queue=q) + a = dpt.asarray(anp, sycl_queue=q) + b = dpt.asarray(bnp, sycl_queue=q) + hev, ev = sub(q, a, b, r, []) + hev.wait() + rnp = dpt.asnumpy(r) + assert np.allclose(rnp + bnp, anp) + + +def test_axpby(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + xnp, pnp = np.random.randn(5), np.random.randn(5) + x = dpt.asarray(xnp, sycl_queue=q) + p = dpt.asarray(pnp, sycl_queue=q) + hev, ev = axpby_inplace(q, 0.5, x, -0.7, p, []) + hev.wait() + rnp = dpt.asnumpy(p) + assert np.allclose(rnp, 0.5 * xnp - 0.7 * pnp) + + +def test_dot(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + anp, bnp = np.random.randn(5), np.random.randn(5) + a = dpt.asarray(anp, sycl_queue=q) + b = dpt.asarray(bnp, sycl_queue=q) + dot_res = dot_blocking(q, a, b) + assert np.allclose(dot_res, np.dot(anp, bnp)) + + +def test_norm_squared(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Queue could not be created") + anp = np.random.randn(5) + a = dpt.asarray(anp, sycl_queue=q) + dot_res = norm_squared_blocking(q, a) + assert np.allclose(dot_res, np.dot(anp, anp)) From 06db2583c8640fbf4d0ea0ea1e1845932029d526 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 25 Apr 2022 13:01:18 -0500 Subject: [PATCH 04/11] Used scikit-build for all pybind11 examples, added tests to all. --- .../external_usm_allocation/CMakeLists.txt | 36 +++++++++++++++++ .../external_usm_allocation/README.md | 3 +- .../external_usm_allocation/example.py | 2 +- .../external_usm_allocation/__init__.py | 3 ++ .../_usm_alloc_example.cpp | 2 +- .../pybind11/external_usm_allocation/setup.py | 26 +++++-------- .../tests/test_dmatrix.py | 30 ++++++++++++++ .../use_dpctl_syclqueue/CMakeLists.txt | 36 +++++++++++++++++ .../pybind11/use_dpctl_syclqueue/README.md | 3 +- .../pybind11/use_dpctl_syclqueue/example.py | 2 +- .../pybind11/use_dpctl_syclqueue/setup.py | 25 +++++------- .../tests/test_queue_device.py | 39 +++++++++++++++++++ .../use_queue_device/__init__.py | 13 +++++++ .../{ => use_queue_device}/_example.cpp | 2 +- 14 files changed, 183 insertions(+), 39 deletions(-) create mode 100644 examples/pybind11/external_usm_allocation/CMakeLists.txt create mode 100644 examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py rename examples/pybind11/external_usm_allocation/{ => external_usm_allocation}/_usm_alloc_example.cpp (99%) create mode 100644 examples/pybind11/external_usm_allocation/tests/test_dmatrix.py create mode 100644 examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt create mode 100644 examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py create mode 100644 examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py rename examples/pybind11/use_dpctl_syclqueue/{ => use_queue_device}/_example.cpp (98%) diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt new file mode 100644 index 0000000000..6f42a3e6d3 --- /dev/null +++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 3.21) + +project(external_usm_allocation LANGUAGES CXX) + +set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) +find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Fetch pybind11 +include(FetchContent) +FetchContent_Declare( + pybind11 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz + URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1 +) +FetchContent_MakeAvailable(pybind11) + +find_package(PythonExtensions REQUIRED) +find_package(Dpctl REQUIRED) +find_package(NumPy REQUIRED) + +set(py_module_name _external_usm_alloc) +pybind11_add_module(${py_module_name} + MODULE + external_usm_allocation/_usm_alloc_example.cpp +) +target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations) +install(TARGETS ${py_module_name} + DESTINATION external_usm_allocation +) + +set(ignoreMe "${SKBUILD}") diff --git a/examples/pybind11/external_usm_allocation/README.md b/examples/pybind11/external_usm_allocation/README.md index 7431ba4ab6..0b8dd62dc2 100644 --- a/examples/pybind11/external_usm_allocation/README.md +++ b/examples/pybind11/external_usm_allocation/README.md @@ -10,6 +10,7 @@ to dpctl.memory entities using `__sycl_usm_array_interface__`. ``` source /opt/intel/oneapi/compiler/latest/env/vars.sh CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace +python -m pytest tests python example.py ``` @@ -17,7 +18,7 @@ python example.py ``` (idp) [12:43:20 ansatnuc04 external_usm_allocation]$ python example.py - + {'data': [94846745444352, True], 'shape': (5, 5), 'strides': None, 'version': 1, 'typestr': '|f8', 'syclobj': } shared diff --git a/examples/pybind11/external_usm_allocation/example.py b/examples/pybind11/external_usm_allocation/example.py index 4455616146..6001d87eca 100644 --- a/examples/pybind11/external_usm_allocation/example.py +++ b/examples/pybind11/external_usm_allocation/example.py @@ -16,7 +16,7 @@ # coding: utf-8 -import external_usm_alloc as eua +import external_usm_allocation as eua import numpy as np import dpctl diff --git a/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py new file mode 100644 index 0000000000..c4a46c18e6 --- /dev/null +++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py @@ -0,0 +1,3 @@ +from ._external_usm_alloc import DMatrix + +__all__ = ["DMatrix"] diff --git a/examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp similarity index 99% rename from examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp rename to examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp index 30216dac55..2b8212076b 100644 --- a/examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp +++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp @@ -132,7 +132,7 @@ py::list tolist(DMatrix &m) return rows; } -PYBIND11_MODULE(external_usm_alloc, m) +PYBIND11_MODULE(_external_usm_alloc, m) { // Import the dpctl extensions import_dpctl(); diff --git a/examples/pybind11/external_usm_allocation/setup.py b/examples/pybind11/external_usm_allocation/setup.py index 2802075d41..86abd2854a 100644 --- a/examples/pybind11/external_usm_allocation/setup.py +++ b/examples/pybind11/external_usm_allocation/setup.py @@ -14,21 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pybind11.setup_helpers import Pybind11Extension -from setuptools import setup +from skbuild import setup -import dpctl - -ext_modules = [ - Pybind11Extension( - "external_usm_alloc", - ["./_usm_alloc_example.cpp"], - include_dirs=[dpctl.get_include()], - extra_compile_args=["-fPIC"], - extra_link_args=["-fPIC"], - libraries=["sycl"], - language="c++", - ) -] - -setup(name="external_usm_alloc", ext_modules=ext_modules) +setup( + name="external_usm_allocation", + version="0.0.1", + description="an example of SYCL-powered Python package (with pybind11)", + author="Intel Scripting", + license="Apache 2.0", + packages=["external_usm_allocation"], +) diff --git a/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py new file mode 100644 index 0000000000..0afe80eb12 --- /dev/null +++ b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py @@ -0,0 +1,30 @@ +import external_usm_allocation as eua +import numpy as np + +import dpctl +import dpctl.memory as dpm + + +def test_dmatrix(): + q = dpctl.SyclQueue() + matr = eua.DMatrix(q, 5, 5) + assert hasattr(matr, "__sycl_usm_array_interface__") + + blob = dpm.as_usm_memory(matr) + assert blob.get_usm_type() == "shared" + + Xh = np.array( + [ + [1, 1, 1, 2, 2], + [1, 0, 1, 2, 2], + [1, 1, 0, 2, 2], + [0, 0, 0, 3, -1], + [0, 0, 0, -1, 5], + ], + dtype="d", + ) + host_bytes_view = Xh.reshape((-1)).view(np.ubyte) + blob.copy_from_host(host_bytes_view) + + list_of_lists = matr.tolist() + assert list_of_lists == Xh.tolist() diff --git a/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt b/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt new file mode 100644 index 0000000000..a788c56bae --- /dev/null +++ b/examples/pybind11/use_dpctl_syclqueue/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 3.21) + +project(use_queue_device LANGUAGES CXX) + +set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) +find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# Fetch pybind11 +include(FetchContent) +FetchContent_Declare( + pybind11 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz + URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1 +) +FetchContent_MakeAvailable(pybind11) + +find_package(PythonExtensions REQUIRED) +find_package(Dpctl REQUIRED) +find_package(NumPy REQUIRED) + +set(py_module_name _use_queue_device) +pybind11_add_module(${py_module_name} + MODULE + use_queue_device/_example.cpp +) +target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) +target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations) +install(TARGETS ${py_module_name} + DESTINATION use_queue_device +) + +set(ignoreMe "${SKBUILD}") diff --git a/examples/pybind11/use_dpctl_syclqueue/README.md b/examples/pybind11/use_dpctl_syclqueue/README.md index 3e7b6804a9..f3ef6e92e5 100644 --- a/examples/pybind11/use_dpctl_syclqueue/README.md +++ b/examples/pybind11/use_dpctl_syclqueue/README.md @@ -9,7 +9,8 @@ extensions. ``` source /opt/intel/oneapi/compiler/latest/env/vars.sh -CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace +CXX=icpx python setup.py build_ext --inplace +python -m pytest tests python example.py ``` diff --git a/examples/pybind11/use_dpctl_syclqueue/example.py b/examples/pybind11/use_dpctl_syclqueue/example.py index 7189adecb8..698e43baf9 100644 --- a/examples/pybind11/use_dpctl_syclqueue/example.py +++ b/examples/pybind11/use_dpctl_syclqueue/example.py @@ -17,7 +17,7 @@ # coding: utf-8 import numpy as np -import use_queue_device_ext as eg +import use_queue_device as eg import dpctl diff --git a/examples/pybind11/use_dpctl_syclqueue/setup.py b/examples/pybind11/use_dpctl_syclqueue/setup.py index 34eeebe38c..4d09be820e 100644 --- a/examples/pybind11/use_dpctl_syclqueue/setup.py +++ b/examples/pybind11/use_dpctl_syclqueue/setup.py @@ -14,20 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pybind11.setup_helpers import Pybind11Extension -from setuptools import setup +from skbuild import setup -import dpctl - -exts = [ - Pybind11Extension( - "use_queue_device_ext", - ["./_example.cpp"], - include_dirs=[dpctl.get_include()], - extra_compile_args=["-fPIC"], - extra_link_args=["-fPIC"], - language="c++", - ), -] - -setup(name="pybind11_example", ext_modules=exts) +setup( + name="use_queue_device", + version="0.0.1", + description="an example of SYCL-powered Python package (with pybind11)", + author="Intel Scripting", + license="Apache 2.0", + packages=["use_queue_device"], +) diff --git a/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py new file mode 100644 index 0000000000..899da500d5 --- /dev/null +++ b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py @@ -0,0 +1,39 @@ +import numpy as np +import use_queue_device as uqd + +import dpctl + + +def test_compute_units(): + q = dpctl.SyclQueue() + mcu = uqd.get_max_compute_units(q) + + assert type(mcu) is int + assert mcu == q.sycl_device.max_compute_units + + +def test_global_memory(): + d = dpctl.SyclDevice() + gm = uqd.get_device_global_mem_size(d) + assert type(gm) is int + assert gm == d.global_mem_size + + +def test_local_memory(): + d = dpctl.SyclDevice() + lm = uqd.get_device_local_mem_size(d) + assert type(lm) is int + assert lm == d.local_mem_size + + +def test_offload_array_mod(): + execution_queue = dpctl.SyclQueue() + X = np.random.randint(low=1, high=2**16 - 1, size=10**6, dtype=np.int64) + modulus_p = 347 + + # Y is a regular NumPy array with NumPy allocated host memory + Y = uqd.offloaded_array_mod(execution_queue, X, modulus_p) + + Ynp = X % modulus_p + + assert np.array_equal(Y, Ynp) diff --git a/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py new file mode 100644 index 0000000000..83b48b9cd7 --- /dev/null +++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py @@ -0,0 +1,13 @@ +from ._use_queue_device import ( + get_device_global_mem_size, + get_device_local_mem_size, + get_max_compute_units, + offloaded_array_mod, +) + +__all__ = [ + "get_max_compute_units", + "get_device_global_mem_size", + "get_device_local_mem_size", + "offloaded_array_mod", +] diff --git a/examples/pybind11/use_dpctl_syclqueue/_example.cpp b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp similarity index 98% rename from examples/pybind11/use_dpctl_syclqueue/_example.cpp rename to examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp index 7b26a35858..54474a0ca1 100644 --- a/examples/pybind11/use_dpctl_syclqueue/_example.cpp +++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/_example.cpp @@ -84,7 +84,7 @@ offloaded_array_mod(sycl::queue &q, return res; } -PYBIND11_MODULE(use_queue_device_ext, m) +PYBIND11_MODULE(_use_queue_device, m) { // Import the dpctl extensions import_dpctl(); From 5822704c0574c9dd29e10b8277997fd6723f80b4 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 25 Apr 2022 13:23:26 -0500 Subject: [PATCH 05/11] Added license headers --- .../external_usm_allocation/__init__.py | 24 +++++++++++++++ .../tests/test_dmatrix.py | 18 ++++++++++++ .../tests/test_queue_device.py | 18 ++++++++++++ .../use_queue_device/__init__.py | 29 +++++++++++++++++++ 4 files changed, 89 insertions(+) diff --git a/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py index c4a46c18e6..1af16147fa 100644 --- a/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py +++ b/examples/pybind11/external_usm_allocation/external_usm_allocation/__init__.py @@ -1,3 +1,27 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + from ._external_usm_alloc import DMatrix __all__ = ["DMatrix"] + +__doc__ = """ + Example of implementing C++ class with its own USM memory allocation logic +and interfacing that allocation with `dpctl` by implementing +`__sycl_usm_array_interface__`. +""" diff --git a/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py index 0afe80eb12..96bfb951e8 100644 --- a/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py +++ b/examples/pybind11/external_usm_allocation/tests/test_dmatrix.py @@ -1,3 +1,21 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + import external_usm_allocation as eua import numpy as np diff --git a/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py index 899da500d5..120cc1fd9e 100644 --- a/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py +++ b/examples/pybind11/use_dpctl_syclqueue/tests/test_queue_device.py @@ -1,3 +1,21 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + import numpy as np import use_queue_device as uqd diff --git a/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py index 83b48b9cd7..ccbca8fd73 100644 --- a/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py +++ b/examples/pybind11/use_dpctl_syclqueue/use_queue_device/__init__.py @@ -1,3 +1,21 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding: utf-8 + from ._use_queue_device import ( get_device_global_mem_size, get_device_local_mem_size, @@ -11,3 +29,14 @@ "get_device_local_mem_size", "offloaded_array_mod", ] + +__doc__ = """ +Example pybind11 extension demonstrating binding of dpctl entities to +SYCL entities. + +dpctl provides type casters that bind ``sycl::queue`` to `dpctl.SyclQueue`, +``sycl::device`` to `dpctl.SyclDevice`, etc. + +Use of these type casters simplifies writing of Python extensions and compile +then using SYCL C++ compilers, such as Intel(R) oneAPI DPC++ compiler. +""" From d31977fb3dcc638dfc53fa5ede74792c0f807293 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 26 Apr 2022 11:25:20 -0500 Subject: [PATCH 06/11] Added license headers and sycl_timing_solver.py file --- examples/pybind11/onemkl_gemv/setup.py | 16 ++++++ examples/pybind11/onemkl_gemv/solve.py | 45 ++++++++++++---- .../onemkl_gemv/sycl_gemm/__init__.py | 16 ++++++ .../onemkl_gemv/sycl_gemm/_onemkl.cpp | 28 ++++++++++ .../onemkl_gemv/sycl_timing_solver.py | 53 +++++++++++++++++++ 5 files changed, 147 insertions(+), 11 deletions(-) create mode 100644 examples/pybind11/onemkl_gemv/sycl_timing_solver.py diff --git a/examples/pybind11/onemkl_gemv/setup.py b/examples/pybind11/onemkl_gemv/setup.py index dd00d8e1a9..1ed65e3af8 100644 --- a/examples/pybind11/onemkl_gemv/setup.py +++ b/examples/pybind11/onemkl_gemv/setup.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from skbuild import setup setup( diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py index b12681ab35..68bb162290 100644 --- a/examples/pybind11/onemkl_gemv/solve.py +++ b/examples/pybind11/onemkl_gemv/solve.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np import sycl_gemm @@ -197,15 +213,16 @@ def cg_solve_numpy(A, b): if __name__ == "__main__": n = 32 - lambda_max = 4 - lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2)))) - # eigenvalues of cartan matrix are Anp = ( 2 * np.eye(n, n, k=0, dtype="d") + np.eye(n, n, k=1, dtype="d") + np.eye(n, n, k=-1, dtype="d") ) bnp = np.geomspace(0.5, 2, n, dtype="d") + # bounds on eigenvalues of cartan matrix are, needed only + # for the Chebyshev solver + lambda_max = 4 + lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2)))) q = dpctl.SyclQueue(property="enable_profiling") A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q) @@ -215,15 +232,21 @@ def cg_solve_numpy(A, b): t = dpctl.SyclTimer() with t(dev.sycl_queue): x, conv = cg_solve(A, b) - print((conv, t.dt)) + print("SYCL solver, 1st run: ", (conv, t.dt)) with t(dev.sycl_queue): x, conv = cg_solve(A, b) - print(dpt.asnumpy(x)) - print((conv, t.dt)) - with t(dev.sycl_queue): - print(check_with_numpy(Anp, bnp)) - print(t.dt) + print("SYCL solver, 2nd run: ", (conv, t.dt)) + + x_ref = check_with_numpy(Anp, bnp) # solve usign LU solver + with t(dev.sycl_queue): x_np, conv = cg_solve_numpy(dpt.asnumpy(A), dpt.asnumpy(b)) - print(x_np, conv) - print(t.dt) + print("NumPy's powered CG solver: ", (conv, t.dt)) + print( + "SYCL cg-solver solution close to reference: ", + np.allclose(dpt.asnumpy(x), x_ref), + ) + print( + "NumPy's cg-solver solution close to reference: ", + np.allclose(x_np, x_ref), + ) diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py index 45af05e21b..b2939f246d 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from ._onemkl import ( axpby_inplace, dot_blocking, diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 84d33bc91e..3a4cf715ab 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -1,3 +1,31 @@ +//==- _onemkl.cpp - Example of Pybind11 extension working with -===// +// dpctl Python objects. +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements Pybind11-generated extension exposing functions that +/// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and +/// dpctl.tensor.usm_ndarray as arguments. +/// +//===----------------------------------------------------------------------===// + // clang-format off #include #include diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py new file mode 100644 index 0000000000..3cd077d5a9 --- /dev/null +++ b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py @@ -0,0 +1,53 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +# coding: utf-8 +import solve + +import dpctl +import dpctl.tensor as dpt + +n = 1000 +rank = 16 + +print( + f"Solving {n} by {n} diagonal linear " + f"system with rank {rank} perturbation." +) + +Anp = np.eye(n, n) + (lambda x: x.T @ x)(np.random.randn(rank, n)) +bnp = np.random.rand(n) + +q = dpctl.SyclQueue(property=["enable_profiling"]) +q.print_device_info() +if q.is_in_order: + print("Using in-order queue") +else: + print("Using not in-order queue") + +api_dev = dpctl.tensor.Device.create_device(q) +A = dpt.asarray(Anp, "d", device=api_dev) +b = dpt.asarray(bnp, "d", device=api_dev) + +timer = dpctl.SyclTimer(time_scale=1e3) + +for i in range(20): + with timer(api_dev.sycl_queue): + solve.cg_solve(A, b) + + print(i, timer.dt) From d768c9b41875f267160ec1216f276e308742076d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 26 Apr 2022 11:48:54 -0500 Subject: [PATCH 07/11] Special-case b==1 case in y[i] = a*x[i] + b*y[i] kernel --- .../onemkl_gemv/sycl_gemm/_onemkl.cpp | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 3a4cf715ab..0328679dd8 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -240,6 +240,7 @@ sub(sycl::queue q, return std::make_pair(ht_event, res_ev); } +template class axpy_inplace_kern; template class axpby_inplace_kern; template @@ -259,11 +260,20 @@ sycl::event axpby_inplace_impl(sycl::queue q, sycl::event res_ev = q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - cgh.parallel_for>(sycl::range<1>{nelems}, - [=](sycl::id<1> id) { - auto i = id.get(0); - y[i] = a * x[i] + b * y[i]; - }); + if (b == T(1)) { + cgh.parallel_for>(sycl::range<1>{nelems}, + [=](sycl::id<1> id) { + auto i = id.get(0); + y[i] += a * x[i]; + }); + } + else { + cgh.parallel_for>( + sycl::range<1>{nelems}, [=](sycl::id<1> id) { + auto i = id.get(0); + y[i] = b * y[i] + a * x[i]; + }); + } }); return res_ev; From bd9bc5f3be90284e5cf5dff01f1a91aaadf89f05 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 26 Apr 2022 11:49:48 -0500 Subject: [PATCH 08/11] Make comments in solve.py more readable --- examples/pybind11/onemkl_gemv/solve.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py index 68bb162290..676ce9c16d 100644 --- a/examples/pybind11/onemkl_gemv/solve.py +++ b/examples/pybind11/onemkl_gemv/solve.py @@ -146,24 +146,27 @@ def cg_solve(A, b): e_p = dpctl.SyclEvent() e_x = dpctl.SyclEvent() for i in range(max_iters): - he_dot, e_dot = sycl_gemm.gemv( - exec_queue, A, p, Ap, depends=[e_p] - ) # Ap = A @ p + # Ap = A @ p + he_dot, e_dot = sycl_gemm.gemv(exec_queue, A, p, Ap, depends=[e_p]) all_host_tasks.append(he_dot) - alpha = rsold / sycl_gemm.dot_blocking( # alpha = rsold / dot(p, Ap) + # alpha = rsold / dot(p, Ap) + alpha = rsold / sycl_gemm.dot_blocking( exec_queue, p, Ap, depends=[e_dot] ) + # x = x + alpha * p he1_axpby, e1_axpby = sycl_gemm.axpby_inplace( exec_queue, alpha, p, 1, x, depends=[e_p, e_x] - ) # x = x + alpha * p + ) all_host_tasks.append(he1_axpby) e_x = e1_axpby + # r = r - alpha * Ap he2_axpby, e2_axpby = sycl_gemm.axpby_inplace( exec_queue, -alpha, Ap, 1, r, depends=[e_p] - ) # r = r - alpha * Ap + ) all_host_tasks.append(he2_axpby) + # rsnew = dot(r, r) rsnew = sycl_gemm.norm_squared_blocking( exec_queue, r, depends=[e2_axpby] ) @@ -173,9 +176,10 @@ def cg_solve(A, b): break beta = rsnew / rsold + # p = r + beta * p he3_axpby, e3_axpby = sycl_gemm.axpby_inplace( exec_queue, 1, r, beta, p, depends=[e1_axpby, e2_axpby] - ) # p = r + beta * p + ) rsold = rsnew all_host_tasks.append(he3_axpby) From c11811a349975cd6f34a8698af25658f6fbd3216 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 26 Apr 2022 11:56:45 -0500 Subject: [PATCH 09/11] Added an empty line for readability --- examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 0328679dd8..afd030d6f7 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -560,6 +560,7 @@ PYBIND11_MODULE(_onemkl, m) { // Import the dpctl extensions import_dpctl(); + m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)", py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"), py::arg("resvec"), py::arg("depends") = py::list()); From 3467eeb3717a127858792c6b1cf0609dee71bb47 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 27 Apr 2022 07:25:48 -0500 Subject: [PATCH 10/11] Factored out common routines into cg_solver.hpp The header and routines defined there are also used in standalone_cpp target built from cpp/main.cpp ``` (idp_2022) [08:27:56 ansatnuc04 onemkl_gemv]$ $(find . -name cmake-build)/standalone_cpp 1000 11 Solving 1000 by 1000 diagonal system with rank-11 perturbation. Converged in : 11 , 11 , 11 , 11 , 11 , 11 , Wall-clock cg_solve execution times: 683.416 , 408.391 , 411.849 , 412.661 , 412.317 , 412.658 , Redisual norm squared: 9.15541e-25 (idp_2022) [08:28:20 ansatnuc04 onemkl_gemv]$ python sycl_timing_solver.py 1000 11 Solving 1000 by 1000 diagonal linear system with rank 11 perturbation. Name Intel(R) UHD Graphics [0x9bca] Driver version 1.3.22992 Vendor Intel(R) Corporation Profile FULL_PROFILE Filter string level_zero:gpu:0 Using not in-order queue 0 (host_dt, device_dt)= (1156.8981241434813, 404.2941620000001) 1 (host_dt, device_dt)= (422.38221131265163, 404.5959500000001) 2 (host_dt, device_dt)= (423.1058154255152, 404.4037220000001) 3 (host_dt, device_dt)= (422.79740050435066, 403.78802800000005) 4 (host_dt, device_dt)= (424.4473725557327, 404.0881560000001) 5 (host_dt, device_dt)= (425.1241758465767, 404.55793600000004) Converged in: [11, 11, 11, 11, 11, 11] Python solution residual norm squared: 3.5289368114384933e-25 ``` README is also expanded. --- examples/pybind11/onemkl_gemv/CMakeLists.txt | 16 +- examples/pybind11/onemkl_gemv/README.md | 23 +- examples/pybind11/onemkl_gemv/cpp/main.cpp | 128 ++++++++++ examples/pybind11/onemkl_gemv/solve.py | 24 +- .../onemkl_gemv/sycl_gemm/_onemkl.cpp | 126 +++------- .../onemkl_gemv/sycl_gemm/cg_solver.hpp | 226 ++++++++++++++++++ .../onemkl_gemv/sycl_timing_solver.py | 31 ++- 7 files changed, 465 insertions(+), 109 deletions(-) create mode 100644 examples/pybind11/onemkl_gemv/cpp/main.cpp create mode 100644 examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt b/examples/pybind11/onemkl_gemv/CMakeLists.txt index 3ba71c6252..848e8c7727 100644 --- a/examples/pybind11/onemkl_gemv/CMakeLists.txt +++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt @@ -38,7 +38,7 @@ pybind11_add_module(${py_module_name} sycl_gemm/_onemkl.cpp ) target_include_directories(${py_module_name} - PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} + PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm ) target_link_libraries(${py_module_name} PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb} @@ -53,4 +53,18 @@ set_source_files_properties(${_sycl_gemm_sources} COMPILE_OPTIONS "-O3;-Wno-deprecated-declarations" ) +add_executable(standalone_cpp + EXCLUDE_FROM_ALL + cpp/main.cpp +) +target_compile_options(standalone_cpp + PRIVATE -O3 -Wno-deprecated-declarations +) +target_include_directories(standalone_cpp + PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm + ) +target_link_libraries(standalone_cpp + PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb} +) + set(ignoreMe "${SKBUILD}") diff --git a/examples/pybind11/onemkl_gemv/README.md b/examples/pybind11/onemkl_gemv/README.md index 9645a4dd92..e984a5e013 100644 --- a/examples/pybind11/onemkl_gemv/README.md +++ b/examples/pybind11/onemkl_gemv/README.md @@ -1,9 +1,15 @@ Example of SYCL built pybind11 extension -To build, use (assumes scikit-build and dpcpp) is installed +To build, use (assumes scikit-build and dpcpp is installed): ```sh -python setup.py develop -- -G "Ninja" -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include +python setup.py develop -- -G "Ninja" \ + -DCMAKE_C_COMPILER:PATH=icx \ + -DCMAKE_CXX_COMPILER:PATH=icpx \ + -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib \ + -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib \ + -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include \ + -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include ``` To run test suite @@ -11,3 +17,16 @@ To run test suite ```sh python -m pytest tests ``` + +To compare Python overhead, + +``` +# build standad-alone executable +cmake --build $(find . -name cmake-build) --target standalone_cpp +# execute it +$(find . -name cmake-build)/standalone_cpp 1000 11 +# launch Python computatin +python sycl_timing_solver.py 1000 11 +``` + +Compare host times vs. C++ wall-clock times while making sure that the number of iterations is the same diff --git a/examples/pybind11/onemkl_gemv/cpp/main.cpp b/examples/pybind11/onemkl_gemv/cpp/main.cpp new file mode 100644 index 0000000000..c0f9a6115b --- /dev/null +++ b/examples/pybind11/onemkl_gemv/cpp/main.cpp @@ -0,0 +1,128 @@ +#include "cg_solver.hpp" +#include +#include +#include +#include + +using T = double; + +int main(int argc, char *argv[]) +{ + size_t n = 1000; + size_t rank = 16; + + if (argc > 1) { + n = std::stoi(argv[1]); + } + + if (argc > 2) { + rank = std::stoi(argv[2]); + } + + std::cout << "Solving " << n << " by " << n << " diagonal system with rank-" + << rank << " perturbation." << std::endl; + + sycl::queue q; + + // USM allocation for data needed by program + size_t buf_size = n * n + rank * n + 2 * n; + T *buf = sycl::malloc_device(buf_size, q); + sycl::event memset_ev = q.fill(buf, T(0), buf_size); + + T *Amat = buf; + T *umat = buf + n * n; + T *bvec = umat + rank * n; + T *sol_vec = bvec + n; + + sycl::event set_diag_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on({memset_ev}); + cgh.parallel_for({n}, [=](sycl::id<1> id) { + auto i = id[0]; + Amat[i * (n + 1)] = T(1); + }); + }); + + oneapi::mkl::rng::philox4x32x10 engine(q, 7777); + oneapi::mkl::rng::gaussian + distr(0.0, 1.0); + + // populate umat and bvec in one call + sycl::event umat_rand_ev = + oneapi::mkl::rng::generate(distr, engine, n * rank + n, umat); + + sycl::event syrk_ev = oneapi::mkl::blas::row_major::syrk( + q, oneapi::mkl::uplo::U, oneapi::mkl::transpose::N, n, rank, T(1), umat, + rank, T(1), Amat, n, {umat_rand_ev, set_diag_ev}); + + // need to transpose + sycl::event transpose_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(syrk_ev); + cgh.parallel_for({n * n}, [=](sycl::id<1> id) { + size_t i = id[0]; + size_t i0 = i / n; + size_t i1 = i - i0 * n; + if (i0 > i1) { + Amat[i] = Amat[i1 * n + i0]; + } + }); + }); + + q.wait(); + + constexpr int reps = 6; + + std::vector time; + std::vector conv_iters; + + time.reserve(reps); + conv_iters.reserve(reps); + for (int i = 0; i < reps; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + int conv_iter_count = cg_solver::cg_solve(q, n, Amat, bvec, sol_vec); + auto end = std::chrono::high_resolution_clock::now(); + + time.push_back( + std::chrono::duration_cast(end - start) + .count() * + 1e-06); + + conv_iters.push_back(conv_iter_count); + } + + std::cout << "Converged in : "; + for (auto &el : conv_iters) { + std::cout << el << " , "; + } + std::cout << std::endl; + + std::cout << "Wall-clock cg_solve execution times: "; + for (auto &el : time) { + std::cout << el << " , "; + } + std::cout << std::endl; + + T *Ax = sycl::malloc_device(2 * n + 1, q); + T *delta = Ax + n; + + sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv( + q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, sol_vec, 1, T(0), Ax, + 1); + + sycl::event sub_ev = oneapi::mkl::vm::sub(q, n, Ax, bvec, delta, {gemv_ev}, + oneapi::mkl::vm::mode::ha); + + T *n2 = delta + n; + sycl::event dot_ev = oneapi::mkl::blas::row_major::dot( + q, n, delta, 1, delta, 1, n2, {sub_ev}); + + T n2_host{}; + q.copy(n2, &n2_host, 1, {dot_ev}).wait_and_throw(); + + std::cout << "Redisual norm squared: " << n2_host << std::endl; + + q.wait_and_throw(); + sycl::free(Ax, q); + sycl::free(buf, q); + + return 0; +} diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py index 676ce9c16d..3bee649afb 100644 --- a/examples/pybind11/onemkl_gemv/solve.py +++ b/examples/pybind11/onemkl_gemv/solve.py @@ -154,36 +154,37 @@ def cg_solve(A, b): exec_queue, p, Ap, depends=[e_dot] ) # x = x + alpha * p - he1_axpby, e1_axpby = sycl_gemm.axpby_inplace( + he1_x_update, e1_x_update = sycl_gemm.axpby_inplace( exec_queue, alpha, p, 1, x, depends=[e_p, e_x] ) - all_host_tasks.append(he1_axpby) - e_x = e1_axpby + all_host_tasks.append(he1_x_update) + e_x = e1_x_update # r = r - alpha * Ap - he2_axpby, e2_axpby = sycl_gemm.axpby_inplace( + he2_r_update, e2_r_update = sycl_gemm.axpby_inplace( exec_queue, -alpha, Ap, 1, r, depends=[e_p] ) - all_host_tasks.append(he2_axpby) + all_host_tasks.append(he2_r_update) # rsnew = dot(r, r) rsnew = sycl_gemm.norm_squared_blocking( - exec_queue, r, depends=[e2_axpby] + exec_queue, r, depends=[e2_r_update] ) if rsnew < 1e-20: - e1_axpby.wait() + e1_x_update.wait() converged = i break beta = rsnew / rsold # p = r + beta * p - he3_axpby, e3_axpby = sycl_gemm.axpby_inplace( - exec_queue, 1, r, beta, p, depends=[e1_axpby, e2_axpby] + he3_p_update, e3_p_update = sycl_gemm.axpby_inplace( + exec_queue, 1, r, beta, p, depends=[e2_r_update] ) rsold = rsnew - all_host_tasks.append(he3_axpby) - e_p = e3_axpby + all_host_tasks.append(he3_p_update) + e_p = e3_p_update + e_x = e1_x_update dpctl.SyclEvent.wait_for(all_host_tasks) return x, converged @@ -229,6 +230,7 @@ def cg_solve_numpy(A, b): lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2)))) q = dpctl.SyclQueue(property="enable_profiling") + q.print_device_info() A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q) dev = A.device b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev) diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index afd030d6f7..080bf3135f 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -29,6 +29,7 @@ // clang-format off #include #include +#include "cg_solver.hpp" #include #include #include @@ -40,11 +41,11 @@ namespace py = pybind11; using dpctl::utils::keep_args_alive; std::pair -gemv(sycl::queue q, - dpctl::tensor::usm_ndarray matrix, - dpctl::tensor::usm_ndarray vector, - dpctl::tensor::usm_ndarray result, - const std::vector &depends = {}) +py_gemv(sycl::queue q, + dpctl::tensor::usm_ndarray matrix, + dpctl::tensor::usm_ndarray vector, + dpctl::tensor::usm_ndarray result, + const std::vector &depends = {}) { if (matrix.get_ndim() != 2 || vector.get_ndim() != 1 || result.get_ndim() != 1) { @@ -139,8 +140,6 @@ gemv(sycl::queue q, return std::make_pair(ht_event, res_ev); } -template class sub_kern; - template sycl::event sub_impl(sycl::queue q, size_t n, @@ -153,24 +152,18 @@ sycl::event sub_impl(sycl::queue q, const T *v2 = reinterpret_cast(v2_i); T *r = reinterpret_cast(r_i); - sycl::event r_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>(sycl::range<1>{n}, [=](sycl::id<1> id) { - auto i = id.get(0); - r[i] = v1[i] - v2[i]; - }); - }); + sycl::event r_ev = cg_solver::detail::sub(q, n, v1, v2, r, depends); return r_ev; } // out_r = in_v1 - in_v2 std::pair -sub(sycl::queue q, - dpctl::tensor::usm_ndarray in_v1, - dpctl::tensor::usm_ndarray in_v2, - dpctl::tensor::usm_ndarray out_r, - const std::vector &depends = {}) +py_sub(sycl::queue q, + dpctl::tensor::usm_ndarray in_v1, + dpctl::tensor::usm_ndarray in_v2, + dpctl::tensor::usm_ndarray out_r, + const std::vector &depends = {}) { if (in_v1.get_ndim() != 1 || in_v2.get_ndim() != 1 || out_r.get_ndim() != 1) { @@ -258,35 +251,20 @@ sycl::event axpby_inplace_impl(sycl::queue q, const T *x = reinterpret_cast(x_typeless); T *y = reinterpret_cast(y_typeless); - sycl::event res_ev = q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (b == T(1)) { - cgh.parallel_for>(sycl::range<1>{nelems}, - [=](sycl::id<1> id) { - auto i = id.get(0); - y[i] += a * x[i]; - }); - } - else { - cgh.parallel_for>( - sycl::range<1>{nelems}, [=](sycl::id<1> id) { - auto i = id.get(0); - y[i] = b * y[i] + a * x[i]; - }); - } - }); + sycl::event res_ev = + cg_solver::detail::axpby_inplace(q, nelems, a, x, b, y, depends); return res_ev; } // y = a * x + b * y std::pair -axpby_inplace(sycl::queue q, - py::object a, - dpctl::tensor::usm_ndarray x, - py::object b, - dpctl::tensor::usm_ndarray y, - const std::vector &depends = {}) +py_axpby_inplace(sycl::queue q, + py::object a, + dpctl::tensor::usm_ndarray x, + py::object b, + dpctl::tensor::usm_ndarray y, + const std::vector &depends = {}) { if (x.get_ndim() != 1 || y.get_ndim() != 1) { @@ -352,8 +330,6 @@ axpby_inplace(sycl::queue q, return std::make_pair(ht_event, res_ev); } -template class norm_squared_blocking_kern; - template T norm_squared_blocking_impl(sycl::queue q, size_t nelems, @@ -362,23 +338,7 @@ T norm_squared_blocking_impl(sycl::queue q, { const T *r = reinterpret_cast(r_typeless); - sycl::buffer sum_sq_buf(sycl::range<1>{1}); - - q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - auto sum_sq_reduction = sycl::reduction( - sum_sq_buf, cgh, sycl::plus(), - {sycl::property::reduction::initialize_to_identity{}}); - cgh.parallel_for>( - sycl::range<1>{nelems}, sum_sq_reduction, - [=](sycl::id<1> id, auto &sum_sq) { - auto i = id.get(0); - sum_sq += r[i] * r[i]; - }); - }).wait_and_throw(); - - sycl::host_accessor ha(sum_sq_buf); - return ha[0]; + return cg_solver::detail::norm_squared_blocking(q, nelems, r, depends); } template class complex_norm_squared_blocking_kern; @@ -393,29 +353,13 @@ T complex_norm_squared_blocking_impl( const std::complex *r = reinterpret_cast *>(r_typeless); - sycl::buffer sum_sq_buf(sycl::range<1>{1}); - - q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - auto sum_sq_reduction = sycl::reduction( - sum_sq_buf, cgh, sycl::plus(), - {sycl::property::reduction::initialize_to_identity{}}); - cgh.parallel_for>( - sycl::range<1>{nelems}, sum_sq_reduction, - [=](sycl::id<1> id, auto &sum_sq) { - auto i = id.get(0); - sum_sq += - r[i].real() * r[i].real() + r[i].imag() * r[i].imag(); - }); - }).wait_and_throw(); - - sycl::host_accessor ha(sum_sq_buf); - return ha[0]; + return cg_solver::detail::complex_norm_squared_blocking(q, nelems, r, + depends); } -py::object norm_squared_blocking(sycl::queue q, - dpctl::tensor::usm_ndarray r, - const std::vector depends = {}) +py::object py_norm_squared_blocking(sycl::queue q, + dpctl::tensor::usm_ndarray r, + const std::vector depends = {}) { if (r.get_ndim() != 1) { throw std::runtime_error("Expecting a vector"); @@ -469,10 +413,10 @@ py::object norm_squared_blocking(sycl::queue q, return res; } -py::object dot_blocking(sycl::queue q, - dpctl::tensor::usm_ndarray v1, - dpctl::tensor::usm_ndarray v2, - const std::vector &depends = {}) +py::object py_dot_blocking(sycl::queue q, + dpctl::tensor::usm_ndarray v1, + dpctl::tensor::usm_ndarray v2, + const std::vector &depends = {}) { if (v1.get_ndim() != 1 || v2.get_ndim() != 1) { throw std::runtime_error("Expecting two vectors"); @@ -561,17 +505,17 @@ PYBIND11_MODULE(_onemkl, m) // Import the dpctl extensions import_dpctl(); - m.def("gemv", &gemv, "Uses oneMKL to compute dot(matrix, vector)", + m.def("gemv", &py_gemv, "Uses oneMKL to compute dot(matrix, vector)", py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"), py::arg("resvec"), py::arg("depends") = py::list()); - m.def("sub", &sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"), + m.def("sub", &py_sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"), py::arg("in1"), py::arg("in2"), py::arg("out"), py::arg("depends") = py::list()); - m.def("axpby_inplace", &axpby_inplace, "y = a * x + b * y", + m.def("axpby_inplace", &py_axpby_inplace, "y = a * x + b * y", py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"), py::arg("y"), py::arg("depends") = py::list()); - m.def("norm_squared_blocking", &norm_squared_blocking, "norm(r)**2", + m.def("norm_squared_blocking", &py_norm_squared_blocking, "norm(r)**2", py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list()); - m.def("dot_blocking", &dot_blocking, "", py::arg("exec_queue"), + m.def("dot_blocking", &py_dot_blocking, "", py::arg("exec_queue"), py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list()); } diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp b/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp new file mode 100644 index 0000000000..6e0396c7a1 --- /dev/null +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/cg_solver.hpp @@ -0,0 +1,226 @@ +//==- _cg_solver.cpp - C++ impl of conjugate gradient linear solver -==// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements Pybind11-generated extension exposing functions that +/// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and +/// dpctl.tensor.usm_ndarray as arguments. +/// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cg_solver +{ +namespace detail +{ + +template class sub_kern; + +template +sycl::event sub(sycl::queue &q, + size_t n, + const T *v1, + const T *v2, + T *r, + const std::vector &depends = {}) +{ + sycl::event r_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>(sycl::range<1>{n}, [=](sycl::id<1> id) { + auto i = id.get(0); + r[i] = v1[i] - v2[i]; + }); + }); + + return r_ev; +} + +template class axpy_inplace_kern; +template class axpby_inplace_kern; + +template +sycl::event axpby_inplace(sycl::queue &q, + size_t nelems, + T a, + const T *x, + T b, + T *y, + const std::vector depends = {}) +{ + + sycl::event res_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (b == T(1)) { + cgh.parallel_for>(sycl::range<1>{nelems}, + [=](sycl::id<1> id) { + auto i = id.get(0); + y[i] += a * x[i]; + }); + } + else { + cgh.parallel_for>( + sycl::range<1>{nelems}, [=](sycl::id<1> id) { + auto i = id.get(0); + y[i] = b * y[i] + a * x[i]; + }); + } + }); + + return res_ev; +} + +template class norm_squared_blocking_kern; + +template +T norm_squared_blocking(sycl::queue &q, + size_t nelems, + const T *r, + const std::vector &depends = {}) +{ + sycl::buffer sum_sq_buf(sycl::range<1>{1}); + + q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + auto sum_sq_reduction = sycl::reduction( + sum_sq_buf, cgh, sycl::plus(), + {sycl::property::reduction::initialize_to_identity{}}); + cgh.parallel_for>( + sycl::range<1>{nelems}, sum_sq_reduction, + [=](sycl::id<1> id, auto &sum_sq) { + auto i = id.get(0); + sum_sq += r[i] * r[i]; + }); + }).wait_and_throw(); + + sycl::host_accessor ha(sum_sq_buf); + return ha[0]; +} + +template class complex_norm_squared_blocking_kern; + +template +T complex_norm_squared_blocking(sycl::queue &q, + size_t nelems, + const std::complex *r, + const std::vector &depends = {}) +{ + sycl::buffer sum_sq_buf(sycl::range<1>{1}); + + q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + auto sum_sq_reduction = sycl::reduction( + sum_sq_buf, cgh, sycl::plus(), + {sycl::property::reduction::initialize_to_identity{}}); + cgh.parallel_for>( + sycl::range<1>{nelems}, sum_sq_reduction, + [=](sycl::id<1> id, auto &sum_sq) { + auto i = id.get(0); + sum_sq += + r[i].real() * r[i].real() + r[i].imag() * r[i].imag(); + }); + }).wait_and_throw(); + + sycl::host_accessor ha(sum_sq_buf); + return ha[0]; +} + +} // namespace detail + +template +int cg_solve(sycl::queue exec_q, + std::int64_t n, + const T *Amat, + const T *bvec, + T *sol_vec, + const std::vector &depends = {}, + T rs_threshold = T(1e-20)) +{ + T *x_vec = sol_vec; + sycl::event fill_ev = exec_q.fill(x_vec, T(0), n, depends); + + // n for r, n for p, n for Ap and 1 for pAp_dot_dev + T *r = sycl::malloc_device(3 * n + 1, exec_q); + T *p = r + n; + T *Ap = p + n; + T *pAp_dot_dev = Ap + n; + + sycl::event copy_to_r_ev = exec_q.copy(bvec, r, n, depends); + sycl::event copy_to_p_ev = exec_q.copy(bvec, p, n, depends); + T rs_old = detail::norm_squared_blocking(exec_q, n, r, {copy_to_r_ev}); + + std::int64_t max_iters = n; + + if (rs_old < rs_threshold) { + sycl::free(r, exec_q); + return 0; + } + + int converged_at = max_iters; + sycl::event prev_dep = copy_to_p_ev; + sycl::event e_x = fill_ev; + + for (std::int64_t i = 0; i < max_iters; ++i) { + sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv( + exec_q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, p, 1, T(0), + Ap, 1, {prev_dep}); + + sycl::event pAp_dot_ev = oneapi::mkl::blas::row_major::dot( + exec_q, n, p, 1, Ap, 1, pAp_dot_dev, {prev_dep, gemv_ev}); + + T pAp_dot_host{}; + exec_q.copy(pAp_dot_dev, &pAp_dot_host, 1, {pAp_dot_ev}) + .wait_and_throw(); + T alpha = rs_old / pAp_dot_host; + + // x = x + alpha * p + sycl::event x_update_ev = + detail::axpby_inplace(exec_q, n, alpha, p, T(1), x_vec, {e_x}); + + // r = r - alpha * Ap + sycl::event r_update_ev = + detail::axpby_inplace(exec_q, n, -alpha, Ap, T(1), r); + + T rs_new = detail::norm_squared_blocking(exec_q, n, r, {r_update_ev}); + + if (rs_new < rs_threshold) { + converged_at = i; + x_update_ev.wait(); + break; + } + + T beta = rs_new / rs_old; + + // p = r + beta * p + prev_dep = + detail::axpby_inplace(exec_q, n, T(1), r, beta, p, {r_update_ev}); + e_x = x_update_ev; + + rs_old = rs_new; + } + + sycl::free(r, exec_q); + return converged_at; +} + +} // namespace cg_solver diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py index 3cd077d5a9..048c9f00f7 100644 --- a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py +++ b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py @@ -14,16 +14,27 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys + import numpy as np # coding: utf-8 import solve +import sycl_gemm import dpctl import dpctl.tensor as dpt +argv = sys.argv + n = 1000 -rank = 16 +rank = 11 + +if len(argv) > 1: + n = int(argv[1]) +if len(argv) > 2: + rank = int(argv[2]) + print( f"Solving {n} by {n} diagonal linear " @@ -46,8 +57,20 @@ timer = dpctl.SyclTimer(time_scale=1e3) -for i in range(20): +iters = [] +for i in range(6): with timer(api_dev.sycl_queue): - solve.cg_solve(A, b) + x, conv_in = solve.cg_solve(A, b) + + print(i, "(host_dt, device_dt)=", timer.dt) + iters.append(conv_in) + +print("Converged in: ", iters) - print(i, timer.dt) +r = dpt.empty_like(b) +hev, ev = sycl_gemm.gemv(q, A, x, r) +delta = dpt.empty_like(b) +hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev]) +rs = sycl_gemm.norm_squared_blocking(q, delta) +dpctl.SyclEvent.wait_for([hev, hev2]) +print(f"Python solution residual norm squared: {rs}") From fc7041eaed2faa021f6b270cf3a285776e31883b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 27 Apr 2022 12:08:54 -0500 Subject: [PATCH 11/11] Streamlined Chebyshev solver Expose cpp_cg_solve used in standalone_cpp executable from Python. Invoked that from Python script sycl_timing_solver.py ```bash $ python sycl_timing_solver.py 1000 11 Solving 1000 by 1000 diagonal linear system with rank 11 perturbation. Name Intel(R) UHD Graphics [0x9bca] Driver version 1.3.22992 Vendor Intel(R) Corporation Profile FULL_PROFILE Filter string level_zero:gpu:0 Using not in-order queue 0 (host_dt, device_dt)= (1157.4030127376318, 403.9605020000001) 1 (host_dt, device_dt)= (421.32044583559036, 403.45619400000004) 2 (host_dt, device_dt)= (420.66121101379395, 402.57058400000005) 3 (host_dt, device_dt)= (421.5433243662119, 402.9254920000001) 4 (host_dt, device_dt)= (421.9988752156496, 402.8818340000001) 5 (host_dt, device_dt)= (422.3589450120926, 402.63814600000006) Converged in: [11, 11, 11, 11, 11, 11] Python solution residual norm squared: 3.2839902926527995e-25 0 (host_dt, device_dt)= (412.9443597048521, 403.6290000000001) 1 (host_dt, device_dt)= (413.7023724615574, 403.8434720000001) 2 (host_dt, device_dt)= (413.4188834577799, 403.1985620000001) 3 (host_dt, device_dt)= (413.85203413665295, 402.70404800000006) 4 (host_dt, device_dt)= (416.2806496024132, 404.0513040000001) 5 (host_dt, device_dt)= (417.43320040404797, 404.74999800000006) Converged in: [11, 11, 11, 11, 11, 11] cpp_cg_solve solution residual norm squared: 3.218393087932091e-25 ``` --- examples/pybind11/onemkl_gemv/solve.py | 40 +++--- .../onemkl_gemv/sycl_gemm/__init__.py | 2 + .../onemkl_gemv/sycl_gemm/_onemkl.cpp | 116 ++++++++++++++++++ .../onemkl_gemv/sycl_timing_solver.py | 16 +++ 4 files changed, 154 insertions(+), 20 deletions(-) diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py index 3bee649afb..6304808078 100644 --- a/examples/pybind11/onemkl_gemv/solve.py +++ b/examples/pybind11/onemkl_gemv/solve.py @@ -38,12 +38,10 @@ def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]): p = empty_like(Ax) e_x = dpctl.SyclEvent() - he_dot, e_dot = sycl_gemm.gemv( - exec_queue, A, x, Ax, depends=depends - ) # Ax = A @ x - he_sub, e_sub = sycl_gemm.sub( - exec_queue, b, Ax, r, depends=[e_dot] - ) # r = b - Ax + # Ax = A @ x + _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=depends) + # r = b - Ax + _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot]) r_ev = e_sub for i in range(nIters): z = r @@ -51,31 +49,33 @@ def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]): if i == 0: p[:] = z alpha = 1 / d - he_axpby, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent() + _, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent() elif i == 1: beta = 0.5 * (c * alpha) ** 2 alpha = 1 / (d - beta / alpha) - he_axpby, e_axpby = sycl_gemm.axpby_inplace( + # p = z + beta * p + _, e_axpby = sycl_gemm.axpby_inplace( exec_queue, 1, z, beta, p, depends=[z_ev] - ) # p = z + beta * p + ) else: beta = (c / 2 * alpha) ** 2 alpha = 1 / (d - beta / alpha) - he_axpby, e_axpby = sycl_gemm.axpby_inplace( + # p = z + beta * p + _, e_axpby = sycl_gemm.axpby_inplace( exec_queue, 1, z, beta, p, depends=[z_ev] - ) # p = z + beta * p - h_x, e_x = sycl_gemm.axpby_inplace( + ) + # x = x + alpha * p + _, e_x = sycl_gemm.axpby_inplace( exec_queue, alpha, p, 1, x, depends=[e_axpby, e_x] - ) # x = x + alpha * p - he_dot, e_dot = sycl_gemm.gemv( - exec_queue, A, x, Ax, depends=[e_x] - ) # Ax = A @ x - he_sub, e_sub = sycl_gemm.sub( - exec_queue, b, Ax, r, depends=[e_dot] - ) # r = b - Ax + ) + # Ax = A @ x + _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=[e_x]) + # r = b - Ax + _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot]) + # residual = dot(r, r) residual = sycl_gemm.norm_squared_blocking( exec_queue, r, depends=[e_sub] - ) # residual = dot(r, r) + ) if residual <= 1e-29: print(f"chebyshev: converged in {i} iters") break diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py index b2939f246d..defce5b76f 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py @@ -16,6 +16,7 @@ from ._onemkl import ( axpby_inplace, + cpp_cg_solve, dot_blocking, gemv, norm_squared_blocking, @@ -28,4 +29,5 @@ "axpby_inplace", "norm_squared_blocking", "dot_blocking", + "cpp_cg_solve", ] diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp index 080bf3135f..0531b8880f 100644 --- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp +++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp @@ -62,6 +62,15 @@ py_gemv(sycl::queue q, throw std::runtime_error("Inconsistent shapes."); } + auto q_ctx = q.get_context(); + if (q_ctx != matrix.get_queue().get_context() || + q_ctx != vector.get_queue().get_context() || + q_ctx != result.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue."); + } + int mat_flags = matrix.get_flags(); int v_flags = vector.get_flags(); int r_flags = result.get_flags(); @@ -176,6 +185,14 @@ py_sub(sycl::queue q, throw std::runtime_error("Vectors must have the same length"); } + if (q.get_context() != in_v1.get_queue().get_context() || + q.get_context() != in_v2.get_queue().get_context() || + q.get_context() != out_r.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue"); + } + int in_v1_flags = in_v1.get_flags(); int in_v2_flags = in_v2.get_flags(); int out_r_flags = out_r.get_flags(); @@ -277,6 +294,13 @@ py_axpby_inplace(sycl::queue q, throw std::runtime_error("Vectors must have the same length"); } + if (q.get_context() != x.get_queue().get_context() || + q.get_context() != y.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue"); + } + int x_flags = x.get_flags(); int y_flags = y.get_flags(); @@ -373,6 +397,11 @@ py::object py_norm_squared_blocking(sycl::queue q, throw std::runtime_error("Vector must be contiguous."); } + if (q.get_context() != r.get_queue().get_context()) { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue"); + } + int r_typenum = r.get_typenum(); if ((r_typenum != UAR_DOUBLE) && (r_typenum != UAR_FLOAT) && (r_typenum != UAR_CDOUBLE) && (r_typenum != UAR_CFLOAT)) @@ -437,6 +466,13 @@ py::object py_dot_blocking(sycl::queue q, throw std::runtime_error("Vectors must be contiguous."); } + if (q.get_context() != v1.get_queue().get_context() || + q.get_context() != v2.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocation is not bound to the context in execution queue"); + } + int v1_typenum = v1.get_typenum(); int v2_typenum = v2.get_typenum(); @@ -500,6 +536,80 @@ py::object py_dot_blocking(sycl::queue q, return res; } +int py_cg_solve(sycl::queue exec_q, + dpctl::tensor::usm_ndarray Amat, + dpctl::tensor::usm_ndarray bvec, + dpctl::tensor::usm_ndarray xvec, + double rs_tol, + const std::vector &depends = {}) +{ + if (Amat.get_ndim() != 2 || bvec.get_ndim() != 1 || xvec.get_ndim() != 1) { + throw py::value_error("Expecting a matrix and two vectors"); + } + + py::ssize_t n0 = Amat.get_shape(0); + py::ssize_t n1 = Amat.get_shape(1); + + if (n0 != n1) { + throw py::value_error("Matrix must be square."); + } + + if (n0 != bvec.get_shape(0) || n0 != xvec.get_shape(0)) { + throw py::value_error( + "Dimensions of the matrix and vectors are not consistent."); + } + + bool all_contig = (Amat.get_flags() & USM_ARRAY_C_CONTIGUOUS) && + (bvec.get_flags() & USM_ARRAY_C_CONTIGUOUS) && + (xvec.get_flags() & USM_ARRAY_C_CONTIGUOUS); + if (!all_contig) { + throw py::value_error("All inputs must be C-contiguous"); + } + + int A_typenum = Amat.get_typenum(); + int b_typenum = bvec.get_typenum(); + int x_typenum = xvec.get_typenum(); + + if (A_typenum != b_typenum || A_typenum != x_typenum) { + throw py::value_error("All arrays must have the same type"); + } + + if (exec_q.get_context() != Amat.get_queue().get_context() || + exec_q.get_context() != bvec.get_queue().get_context() || + exec_q.get_context() != xvec.get_queue().get_context()) + { + throw std::runtime_error( + "USM allocations are not bound to context in execution queue"); + } + + const char *A_ch = Amat.get_data(); + const char *b_ch = bvec.get_data(); + char *x_ch = xvec.get_data(); + + if (A_typenum == UAR_DOUBLE) { + using T = double; + int iters = cg_solver::cg_solve( + exec_q, n0, reinterpret_cast(A_ch), + reinterpret_cast(b_ch), reinterpret_cast(x_ch), + depends, static_cast(rs_tol)); + + return iters; + } + else if (A_typenum == UAR_FLOAT) { + using T = float; + int iters = cg_solver::cg_solve( + exec_q, n0, reinterpret_cast(A_ch), + reinterpret_cast(b_ch), reinterpret_cast(x_ch), + depends, static_cast(rs_tol)); + + return iters; + } + else { + throw std::runtime_error( + "Unsupported data type. Use single or double precision."); + } +} + PYBIND11_MODULE(_onemkl, m) { // Import the dpctl extensions @@ -518,4 +628,10 @@ PYBIND11_MODULE(_onemkl, m) py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list()); m.def("dot_blocking", &py_dot_blocking, "", py::arg("exec_queue"), py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list()); + + m.def("cpp_cg_solve", &py_cg_solve, + "Dispatch to call C++ implementation of cg_solve", + py::arg("exec_queue"), py::arg("Amat"), py::arg("bvec"), + py::arg("xvec"), py::arg("rs_squared_tolerance") = py::float_(1e-20), + py::arg("depends") = py::list()); } diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py index 048c9f00f7..42178ee0ed 100644 --- a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py +++ b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py @@ -74,3 +74,19 @@ rs = sycl_gemm.norm_squared_blocking(q, delta) dpctl.SyclEvent.wait_for([hev, hev2]) print(f"Python solution residual norm squared: {rs}") + +x_cpp = dpt.empty_like(b) +iters = [] +for i in range(6): + with timer(api_dev.sycl_queue): + conv_in = sycl_gemm.cpp_cg_solve(q, A, b, x_cpp) + + print(i, "(host_dt, device_dt)=", timer.dt) + iters.append(conv_in) + +print("Converged in: ", iters) +hev, ev = sycl_gemm.gemv(q, A, x_cpp, r) +hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev]) +rs = sycl_gemm.norm_squared_blocking(q, delta) +dpctl.SyclEvent.wait_for([hev, hev2]) +print(f"cpp_cg_solve solution residual norm squared: {rs}")