From 1c67f182ac51fea6290aa64f3e3303bf08e760c7 Mon Sep 17 00:00:00 2001
From: Natalia Polina <natalia.polina@intel.com>
Date: Thu, 15 Sep 2022 01:13:08 -0500
Subject: [PATCH 1/6] Add tril and triu function

---
 dpctl/tensor/__init__.py                    |   4 +
 dpctl/tensor/_ctors.py                      |  32 ++
 dpctl/tensor/libtensor/source/tensor_py.cpp | 311 ++++++++++++++++++++
 3 files changed, 347 insertions(+)
diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 6c9ac88fc2..5a31a05b61 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -33,6 +33,8 @@
     linspace,
     ones,
     ones_like,
+    tril,
+    triu,
     zeros,
     zeros_like,
 )
@@ -83,4 +85,6 @@
     "to_numpy",
     "asnumpy",
     "from_dlpack",
+    "tril",
+    "triu",
 ]
diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py
index e3bb8fe3ec..94d24505ca 100644
--- a/dpctl/tensor/_ctors.py
+++ b/dpctl/tensor/_ctors.py
@@ -1116,3 +1116,35 @@ def eye(
         hev, _ = ti._eye(k, dst=res, sycl_queue=sycl_queue)
         hev.wait()
     return res
+
+
+def tril(X, k=0):
+    """
+    tril(X: usm_ndarray, k: int) -> usm_ndarray
+
+    Returns the lower triangular part of a matrix (or a stack of matrices) X.
+    """
+    if type(X) is not dpt.usm_ndarray:
+        raise TypeError
+
+    res = dpt.empty(X.shape, dtype=X.dtype, sycl_queue=X.sycl_queue)
+    hev, _ = ti._tril(sycl_queue=X.sycl_queue, src=X, dst=res, k=k)
+    hev.wait()
+
+    return res
+
+
+def triu(X, k=0):
+    """
+    triu(X: usm_ndarray, k: int) -> usm_ndarray
+
+    Returns the upper triangular part of a matrix (or a stack of matrices) X.
+    """
+    if type(X) is not dpt.usm_ndarray:
+        raise TypeError
+
+    res = dpt.empty(X.shape, dtype=X.dtype, sycl_queue=X.sycl_queue)
+    hev, _ = ti._triu(sycl_queue=X.sycl_queue, src=X, dst=res, k=k)
+    hev.wait()
+
+    return res
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 6f625c5bb4..46f95cac15 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -1879,6 +1879,290 @@ eye(py::ssize_t k,
                           eye_event);
 }
 
+/* =========================== Tril and triu ============================== */
+// define function type
+typedef sycl::event (*tri_fn_ptr_t)(sycl::queue,
+                                    py::ssize_t,   // inner_range  //py::ssize_t
+                                    py::ssize_t,   // outer_range
+                                    char *,        // src_data_ptr
+                                    char *,        // dst_data_ptr
+                                    py::ssize_t,   // nd
+                                    py::ssize_t *, // shape_and_strides
+                                    int,           // k
+                                    const std::vector<sycl::event> &,
+                                    const std::vector<sycl::event> &);
+
+template <typename Ty, bool> class tri_kernel;
+template <typename Ty, bool l>
+sycl::event tri_impl(sycl::queue exec_q,
+                     py::ssize_t inner_range,
+                     py::ssize_t outer_range,
+                     char *src_p,
+                     char *dst_p,
+                     py::ssize_t nd,
+                     py::ssize_t *shape_and_strides,
+                     int k,
+                     const std::vector<sycl::event> &depends,
+                     const std::vector<sycl::event> &additional_depends)
+{
+    constexpr int d2 = 2;
+    py::ssize_t src_s = nd;
+    py::ssize_t dst_s = 2 * nd;
+    py::ssize_t nd_1 = nd - 1;
+    py::ssize_t nd_2 = nd - 2;
+    Ty *src = reinterpret_cast<Ty *>(src_p);
+    Ty *dst = reinterpret_cast<Ty *>(dst_p);
+
+    sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+        cgh.depends_on(additional_depends);
+        cgh.parallel_for<tri_kernel<Ty, l>>(
+            sycl::range<2>(inner_range, outer_range), [=](sycl::item<2> idx) {
+                py::ssize_t src_inner_offset, dst_inner_offset;
+                bool to_copy;
+
+                {
+                    py::ssize_t inner_gid = idx.get_id(0);
+                    CIndexer_array<d2, py::ssize_t> indexer_i(
+                        {shape_and_strides[nd_2], shape_and_strides[nd_1]});
+                    indexer_i.set(inner_gid);
+                    const std::array<py::ssize_t, d2> &inner = indexer_i.get();
+                    src_inner_offset =
+                        inner[0] * shape_and_strides[src_s + nd_2] +
+                        inner[1] * shape_and_strides[src_s + nd_1];
+                    dst_inner_offset =
+                        inner[0] * shape_and_strides[dst_s + nd_2] +
+                        inner[1] * shape_and_strides[dst_s + nd_1];
+
+                    if (l)
+                        to_copy = (inner[0] + k >= inner[1]);
+                    else
+                        to_copy = (inner[0] + k <= inner[1]);
+                }
+
+                py::ssize_t src_offset = 0;
+                py::ssize_t dst_offset = 0;
+                {
+                    py::ssize_t outer_gid = idx.get_id(1);
+                    CIndexer_vector<py::ssize_t> outer(nd - d2);
+                    outer.get_displacement(
+                        outer_gid, shape_and_strides, shape_and_strides + src_s,
+                        shape_and_strides + dst_s, src_offset, dst_offset);
+                }
+
+                src_offset += src_inner_offset;
+                dst_offset += dst_inner_offset;
+
+                dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0);
+            });
+    });
+    return tri_ev;
+}
+
+static tri_fn_ptr_t tril_generic_dispatch_vector[_ns::num_types];
+
+template <typename fnT, typename Ty> struct TrilGenericFactory
+{
+    fnT get()
+    {
+        fnT f = tri_impl<Ty, /*tril*/ true>;
+        return f;
+    }
+};
+
+static tri_fn_ptr_t triu_generic_dispatch_vector[_ns::num_types];
+
+template <typename fnT, typename Ty> struct TriuGenericFactory
+{
+    fnT get()
+    {
+        fnT f = tri_impl<Ty, /*triu*/ false>;
+        return f;
+    }
+};
+
+std::pair<sycl::event, sycl::event>
+tri(sycl::queue &exec_q,
+    dpctl::tensor::usm_ndarray src,
+    dpctl::tensor::usm_ndarray dst,
+    char part,
+    int k = 0,
+    const std::vector<sycl::event> &depends = {})
+{
+    // array dimensions must be the same
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    if (src_nd < 2) {
+        throw py::value_error("Array dimensions less than 2.");
+    }
+
+    // shapes must be the same
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    size_t src_nelems(1);
+
+    for (int i = 0; i < src_nd; ++i) {
+        src_nelems *= static_cast<size_t>(src_shape[i]);
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal) {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    if (dst_typeid != src_typeid) {
+        throw py::value_error("Array dtype are not the same.");
+    }
+
+    // check same contexts
+    sycl::queue src_q = src.get_queue();
+    sycl::queue dst_q = dst.get_queue();
+
+    sycl::context exec_ctx = exec_q.get_context();
+    sycl::device exec_d = exec_q.get_device();
+    if (src_q.get_context() != exec_ctx || dst_q.get_context() != exec_ctx ||
+        src_q.get_device() != exec_d || dst_q.get_device() != exec_d)
+    {
+        throw py::value_error(
+            "Execution queue context is not the same as allocation contexts");
+    }
+
+    using shT = std::vector<py::ssize_t>;
+    int src_flags = src.get_flags();
+    const py::ssize_t *src_strides_raw = src.get_strides_raw();
+    shT src_strides(src_nd);
+    bool is_src_c_contig = ((src_flags & USM_ARRAY_C_CONTIGUOUS) != 0);
+    bool is_src_f_contig = ((src_flags & USM_ARRAY_F_CONTIGUOUS) != 0);
+    if (src_strides_raw == nullptr) {
+        if (is_src_c_contig) {
+            src_strides = c_contiguous_strides(src_nd, src_shape);
+        }
+        else if (is_src_f_contig) {
+            src_strides = f_contiguous_strides(src_nd, src_shape);
+        }
+        else {
+            throw std::runtime_error("Source array has null strides but has "
+                                     "neither C- nor F- contiguous flag set");
+        }
+    }
+    else {
+        for (ssize_t i = 0; i < src_nd; i++) {
+            src_strides[i] = src_strides_raw[i];
+        }
+    }
+
+    int dst_flags = dst.get_flags();
+    const py::ssize_t *dst_strides_raw = dst.get_strides_raw();
+    shT dst_strides(src_nd);
+    bool is_dst_c_contig = ((dst_flags & USM_ARRAY_C_CONTIGUOUS) != 0);
+    bool is_dst_f_contig = ((dst_flags & USM_ARRAY_F_CONTIGUOUS) != 0);
+    if (dst_strides_raw == nullptr) {
+        if (is_dst_c_contig) {
+            dst_strides = c_contiguous_strides(src_nd, src_shape);
+        }
+        else if (is_dst_f_contig) {
+            dst_strides = f_contiguous_strides(src_nd, src_shape);
+        }
+        else {
+            throw std::runtime_error("Source array has null strides but has "
+                                     "neither C- nor F- contiguous flag set");
+        }
+    }
+    else {
+        for (ssize_t i = 0; i < src_nd; i++) {
+            dst_strides[i] = dst_strides_raw[i];
+        }
+    }
+
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    constexpr py::ssize_t src_itemsize = 1; // item size in elements
+    constexpr py::ssize_t dst_itemsize = 1; // item size in elements
+
+    int nd = src_nd - 2;
+    const py::ssize_t *shape = src_shape;
+    const py::ssize_t *p_src_strides = &src_strides[0];
+    const py::ssize_t *p_dst_strides = &dst_strides[0];
+    simplify_iteration_space(nd, shape, p_src_strides, src_itemsize,
+                             is_src_c_contig, is_src_f_contig, p_dst_strides,
+                             dst_itemsize, is_dst_c_contig, is_dst_f_contig,
+                             simplified_shape, simplified_src_strides,
+                             simplified_dst_strides, src_offset, dst_offset);
+
+    nd += 2;
+    std::vector<py::ssize_t> shape_and_strides(3 * nd);
+
+    std::copy(simplified_shape.begin(), simplified_shape.end(),
+              shape_and_strides.begin());
+    shape_and_strides[nd - 2] = src_shape[src_nd - 2];
+    shape_and_strides[nd - 1] = src_shape[src_nd - 1];
+    std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
+              shape_and_strides.begin() + nd);
+    shape_and_strides[2 * nd - 2] = src_strides[src_nd - 2];
+    shape_and_strides[2 * nd - 1] = src_strides[src_nd - 1];
+    std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
+              shape_and_strides.begin() + 2 * nd);
+    shape_and_strides[3 * nd - 2] = dst_strides[src_nd - 2];
+    shape_and_strides[3 * nd - 1] = dst_strides[src_nd - 1];
+
+    std::shared_ptr<shT> shp_shape_and_strides =
+        std::make_shared<shT>(shape_and_strides);
+
+    py::ssize_t *dev_shape_and_strides =
+        sycl::malloc_device<ssize_t>(3 * nd, exec_q);
+    if (dev_shape_and_strides == nullptr) {
+        throw std::runtime_error("Unabled to allocate device memory");
+    }
+    sycl::event copy_shape_and_strides = exec_q.copy<ssize_t>(
+        shp_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);
+
+    py::ssize_t inner_range =
+        shape_and_strides[nd - 1] * shape_and_strides[nd - 2];
+    py::ssize_t outer_range = src_nelems / inner_range;
+
+    sycl::event tri_ev;
+    if (part == 'l') {
+        auto fn = tril_generic_dispatch_vector[src_typeid];
+        tri_ev =
+            fn(exec_q, inner_range, outer_range, src.get_data(), dst.get_data(),
+               nd, dev_shape_and_strides, k, depends, {copy_shape_and_strides});
+    }
+    else {
+        auto fn = triu_generic_dispatch_vector[src_typeid];
+        tri_ev =
+            fn(exec_q, inner_range, outer_range, src.get_data(), dst.get_data(),
+               nd, dev_shape_and_strides, k, depends, {copy_shape_and_strides});
+    }
+
+    exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on({tri_ev});
+        auto ctx = exec_q.get_context();
+        cgh.host_task([shp_shape_and_strides, dev_shape_and_strides, ctx]() {
+            sycl::free(dev_shape_and_strides, ctx);
+        });
+    });
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}),
+                          tri_ev);
+}
+
 // populate dispatch tables
 void init_copy_and_cast_dispatch_tables(void)
 {
@@ -1936,6 +2220,12 @@ void init_copy_for_reshape_dispatch_vector(void)
     DispatchVectorBuilder<eye_fn_ptr_t, EyeFactory, num_types> dvb4;
     dvb4.populate_dispatch_vector(eye_dispatch_vector);
 
+    DispatchVectorBuilder<tri_fn_ptr_t, TrilGenericFactory, num_types> dvb5;
+    dvb5.populate_dispatch_vector(tril_generic_dispatch_vector);
+
+    DispatchVectorBuilder<tri_fn_ptr_t, TriuGenericFactory, num_types> dvb6;
+    dvb6.populate_dispatch_vector(triu_generic_dispatch_vector);
+
     return;
 }
 
@@ -2081,4 +2371,25 @@ PYBIND11_MODULE(_tensor_impl, m)
           [](sycl::device dev) -> std::string {
               return get_default_device_complex_type(dev);
           });
+    m.def(
+        "_tril",
+        [](sycl::queue exec_q, dpctl::tensor::usm_ndarray src,
+           dpctl::tensor::usm_ndarray dst, int k,
+           const std::vector<sycl::event> depends)
+            -> std::pair<sycl::event, sycl::event> {
+            return tri(exec_q, src, dst, 'l', k, depends);
+        },
+        "Tril helper function.", py::arg("sycl_queue"), py::arg("src"),
+        py::arg("dst"), py::arg("k") = 0, py::arg("depends") = py::list());
+
+    m.def(
+        "_triu",
+        [](sycl::queue exec_q, dpctl::tensor::usm_ndarray src,
+           dpctl::tensor::usm_ndarray dst, int k,
+           const std::vector<sycl::event> depends)
+            -> std::pair<sycl::event, sycl::event> {
+            return tri(exec_q, src, dst, 'u', k, depends);
+        },
+        "Triu helper function.", py::arg("sycl_queue"), py::arg("src"),
+        py::arg("dst"), py::arg("k") = 0, py::arg("depends") = py::list());
 }

From 82b4186d06ab69f66fc888b5639dabd041fdf8bb Mon Sep 17 00:00:00 2001
From: Natalia Polina <natalia.polina@intel.com>
Date: Fri, 16 Sep 2022 15:08:03 -0500
Subject: [PATCH 2/6] Minor changes

---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 74 ++++++++++++---------
 1 file changed, 41 insertions(+), 33 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 46f95cac15..282e6851ed 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -1888,7 +1888,7 @@ typedef sycl::event (*tri_fn_ptr_t)(sycl::queue,
                                     char *,        // dst_data_ptr
                                     py::ssize_t,   // nd
                                     py::ssize_t *, // shape_and_strides
-                                    int,           // k
+                                    py::ssize_t,   // k
                                     const std::vector<sycl::event> &,
                                     const std::vector<sycl::event> &);
 
@@ -1901,7 +1901,7 @@ sycl::event tri_impl(sycl::queue exec_q,
                      char *dst_p,
                      py::ssize_t nd,
                      py::ssize_t *shape_and_strides,
-                     int k,
+                     py::ssize_t k,
                      const std::vector<sycl::event> &depends,
                      const std::vector<sycl::event> &additional_depends)
 {
@@ -1917,12 +1917,15 @@ sycl::event tri_impl(sycl::queue exec_q,
         cgh.depends_on(depends);
         cgh.depends_on(additional_depends);
         cgh.parallel_for<tri_kernel<Ty, l>>(
-            sycl::range<2>(inner_range, outer_range), [=](sycl::item<2> idx) {
+            sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) {
+                py::ssize_t outer_gid = idx[0] / inner_range;
+                py::ssize_t inner_gid = idx[0] - inner_range * outer_gid;
+
                 py::ssize_t src_inner_offset, dst_inner_offset;
                 bool to_copy;
 
                 {
-                    py::ssize_t inner_gid = idx.get_id(0);
+                    // py::ssize_t inner_gid = idx.get_id(0);
                     CIndexer_array<d2, py::ssize_t> indexer_i(
                         {shape_and_strides[nd_2], shape_and_strides[nd_1]});
                     indexer_i.set(inner_gid);
@@ -1943,7 +1946,7 @@ sycl::event tri_impl(sycl::queue exec_q,
                 py::ssize_t src_offset = 0;
                 py::ssize_t dst_offset = 0;
                 {
-                    py::ssize_t outer_gid = idx.get_id(1);
+                    // py::ssize_t outer_gid = idx.get_id(1);
                     CIndexer_vector<py::ssize_t> outer(nd - d2);
                     outer.get_displacement(
                         outer_gid, shape_and_strides, shape_and_strides + src_s,
@@ -1986,7 +1989,7 @@ tri(sycl::queue &exec_q,
     dpctl::tensor::usm_ndarray src,
     dpctl::tensor::usm_ndarray dst,
     char part,
-    int k = 0,
+    py::ssize_t k = 0,
     const std::vector<sycl::event> &depends = {})
 {
     // array dimensions must be the same
@@ -2007,7 +2010,7 @@ tri(sycl::queue &exec_q,
     bool shapes_equal(true);
     size_t src_nelems(1);
 
-    for (int i = 0; i < src_nd; ++i) {
+    for (int i = 0; shapes_equal && i < src_nd; ++i) {
         src_nelems *= static_cast<size_t>(src_shape[i]);
         shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
     }
@@ -2032,11 +2035,7 @@ tri(sycl::queue &exec_q,
     sycl::queue src_q = src.get_queue();
     sycl::queue dst_q = dst.get_queue();
 
-    sycl::context exec_ctx = exec_q.get_context();
-    sycl::device exec_d = exec_q.get_device();
-    if (src_q.get_context() != exec_ctx || dst_q.get_context() != exec_ctx ||
-        src_q.get_device() != exec_d || dst_q.get_device() != exec_d)
-    {
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) {
         throw py::value_error(
             "Execution queue context is not the same as allocation contexts");
     }
@@ -2060,9 +2059,8 @@ tri(sycl::queue &exec_q,
         }
     }
     else {
-        for (ssize_t i = 0; i < src_nd; i++) {
-            src_strides[i] = src_strides_raw[i];
-        }
+        std::copy(src_strides_raw, src_strides_raw + src_nd,
+                  src_strides.begin());
     }
 
     int dst_flags = dst.get_flags();
@@ -2083,9 +2081,8 @@ tri(sycl::queue &exec_q,
         }
     }
     else {
-        for (ssize_t i = 0; i < src_nd; i++) {
-            dst_strides[i] = dst_strides_raw[i];
-        }
+        std::copy(dst_strides_raw, dst_strides_raw + dst_nd,
+                  dst_strides.begin());
     }
 
     shT simplified_shape;
@@ -2099,14 +2096,20 @@ tri(sycl::queue &exec_q,
 
     int nd = src_nd - 2;
     const py::ssize_t *shape = src_shape;
-    const py::ssize_t *p_src_strides = &src_strides[0];
-    const py::ssize_t *p_dst_strides = &dst_strides[0];
+    const py::ssize_t *p_src_strides = src_strides.data();
+    ;
+    const py::ssize_t *p_dst_strides = dst_strides.data();
+    ;
     simplify_iteration_space(nd, shape, p_src_strides, src_itemsize,
                              is_src_c_contig, is_src_f_contig, p_dst_strides,
                              dst_itemsize, is_dst_c_contig, is_dst_f_contig,
                              simplified_shape, simplified_src_strides,
                              simplified_dst_strides, src_offset, dst_offset);
 
+    if (src_offset != 0 || dst_offset != 0) {
+        throw py::value_error("Reversed slice for dst is not supported");
+    }
+
     nd += 2;
     std::vector<py::ssize_t> shape_and_strides(3 * nd);
 
@@ -2123,7 +2126,7 @@ tri(sycl::queue &exec_q,
     shape_and_strides[3 * nd - 2] = dst_strides[src_nd - 2];
     shape_and_strides[3 * nd - 1] = dst_strides[src_nd - 1];
 
-    std::shared_ptr<shT> shp_shape_and_strides =
+    std::shared_ptr<shT> shp_host_shape_and_strides =
         std::make_shared<shT>(shape_and_strides);
 
     py::ssize_t *dev_shape_and_strides =
@@ -2132,7 +2135,7 @@ tri(sycl::queue &exec_q,
         throw std::runtime_error("Unabled to allocate device memory");
     }
     sycl::event copy_shape_and_strides = exec_q.copy<ssize_t>(
-        shp_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);
+        shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);
 
     py::ssize_t inner_range =
         shape_and_strides[nd - 1] * shape_and_strides[nd - 2];
@@ -2155,9 +2158,12 @@ tri(sycl::queue &exec_q,
     exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on({tri_ev});
         auto ctx = exec_q.get_context();
-        cgh.host_task([shp_shape_and_strides, dev_shape_and_strides, ctx]() {
-            sycl::free(dev_shape_and_strides, ctx);
-        });
+        cgh.host_task(
+            [shp_host_shape_and_strides, dev_shape_and_strides, ctx]() {
+                // capture of shp_host_shape_and_strides ensure the underlying
+                // vector exists for the entire execution of copying kernel
+                sycl::free(dev_shape_and_strides, ctx);
+            });
     });
     return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}),
                           tri_ev);
@@ -2373,23 +2379,25 @@ PYBIND11_MODULE(_tensor_impl, m)
           });
     m.def(
         "_tril",
-        [](sycl::queue exec_q, dpctl::tensor::usm_ndarray src,
-           dpctl::tensor::usm_ndarray dst, int k,
+        [](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst,
+           py::ssize_t k, sycl::queue exec_q,
            const std::vector<sycl::event> depends)
             -> std::pair<sycl::event, sycl::event> {
             return tri(exec_q, src, dst, 'l', k, depends);
         },
-        "Tril helper function.", py::arg("sycl_queue"), py::arg("src"),
-        py::arg("dst"), py::arg("k") = 0, py::arg("depends") = py::list());
+        "Tril helper function.", py::arg("src"), py::arg("dst"),
+        py::arg("k") = 0, py::arg("sycl_queue"),
+        py::arg("depends") = py::list());
 
     m.def(
         "_triu",
-        [](sycl::queue exec_q, dpctl::tensor::usm_ndarray src,
-           dpctl::tensor::usm_ndarray dst, int k,
+        [](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst,
+           py::ssize_t k, sycl::queue exec_q,
            const std::vector<sycl::event> depends)
             -> std::pair<sycl::event, sycl::event> {
             return tri(exec_q, src, dst, 'u', k, depends);
         },
-        "Triu helper function.", py::arg("sycl_queue"), py::arg("src"),
-        py::arg("dst"), py::arg("k") = 0, py::arg("depends") = py::list());
+        "Triu helper function.", py::arg("src"), py::arg("dst"),
+        py::arg("k") = 0, py::arg("sycl_queue"),
+        py::arg("depends") = py::list());
 }

From ef5f6155cca1a2a2396861b891c96bf95259625d Mon Sep 17 00:00:00 2001
From: Natalia Polina <natalia.polina@intel.com>
Date: Fri, 16 Sep 2022 15:12:01 -0500
Subject: [PATCH 3/6] Added special case for order=f

---
 dpctl/tensor/_ctors.py | 62 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py
index 94d24505ca..e5db30ac3e 100644
--- a/dpctl/tensor/_ctors.py
+++ b/dpctl/tensor/_ctors.py
@@ -1127,9 +1127,34 @@ def tril(X, k=0):
     if type(X) is not dpt.usm_ndarray:
         raise TypeError
 
-    res = dpt.empty(X.shape, dtype=X.dtype, sycl_queue=X.sycl_queue)
-    hev, _ = ti._tril(sycl_queue=X.sycl_queue, src=X, dst=res, k=k)
-    hev.wait()
+    k = operator.index(k)
+
+    # F_CONTIGUOUS = 2
+    order = "f" if (X.flags & 2) else "c"
+
+    shape = X.shape
+    nd = X.ndim
+    if nd < 2:
+        raise ValueError("Array dimensions less than 2.")
+
+    if k >= shape[nd - 1] - 1:
+        res = dpt.empty(
+            X.shape, dtype=X.dtype, order=order, sycl_queue=X.sycl_queue
+        )
+        hev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=X, dst=res, sycl_queue=X.sycl_queue
+        )
+        hev.wait()
+    elif k < -shape[nd - 2]:
+        res = dpt.zeros(
+            X.shape, dtype=X.dtype, order=order, sycl_queue=X.sycl_queue
+        )
+    else:
+        res = dpt.empty(
+            X.shape, dtype=X.dtype, order=order, sycl_queue=X.sycl_queue
+        )
+        hev, _ = ti._tril(src=X, dst=res, k=k, sycl_queue=X.sycl_queue)
+        hev.wait()
 
     return res
 
@@ -1143,8 +1168,33 @@ def triu(X, k=0):
     if type(X) is not dpt.usm_ndarray:
         raise TypeError
 
-    res = dpt.empty(X.shape, dtype=X.dtype, sycl_queue=X.sycl_queue)
-    hev, _ = ti._triu(sycl_queue=X.sycl_queue, src=X, dst=res, k=k)
-    hev.wait()
+    k = operator.index(k)
+
+    # F_CONTIGUOUS = 2
+    order = "f" if (X.flags & 2) else "c"
+
+    shape = X.shape
+    nd = X.ndim
+    if nd < 2:
+        raise ValueError("Array dimensions less than 2.")
+
+    if k > shape[nd - 1]:
+        res = dpt.zeros(
+            X.shape, dtype=X.dtype, order=order, sycl_queue=X.sycl_queue
+        )
+    elif k <= -shape[nd - 2] + 1:
+        res = dpt.empty(
+            X.shape, dtype=X.dtype, order=order, sycl_queue=X.sycl_queue
+        )
+        hev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+            src=X, dst=res, sycl_queue=X.sycl_queue
+        )
+        hev.wait()
+    else:
+        res = dpt.empty(
+            X.shape, dtype=X.dtype, order=order, sycl_queue=X.sycl_queue
+        )
+        hev, _ = ti._triu(src=X, dst=res, k=k, sycl_queue=X.sycl_queue)
+        hev.wait()
 
     return res

From d660147a7e6b388dbc5007bfc5d9aace38635782 Mon Sep 17 00:00:00 2001
From: Natalia Polina <natalia.polina@intel.com>
Date: Fri, 16 Sep 2022 15:14:12 -0500
Subject: [PATCH 4/6] Added memory overlap check

---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 282e6851ed..7e9cf96122 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -2023,6 +2023,21 @@ tri(sycl::queue &exec_q,
         return std::make_pair(sycl::event(), sycl::event());
     }
 
+    // check that arrays do not overlap, and concurrent copying is safe.
+    auto src_offsets = src.get_minmax_offsets();
+    int src_elem_size = src.get_elemsize();
+    int dst_elem_size = dst.get_elemsize();
+
+    bool memory_overlap =
+        ((dst_data - src_data > src_offsets.second * src_elem_size -
+                                    dst_offsets.first * dst_elem_size) &&
+         (src_data - dst_data > dst_offsets.second * dst_elem_size -
+                                    src_offsets.first * src_elem_size));
+    if (memory_overlap) {
+        // TODO: could use a temporary, but this is done by the caller
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
     int src_typenum = src.get_typenum();
     int dst_typenum = dst.get_typenum();
     int src_typeid = array_types.typenum_to_lookup_id(src_typenum);

From d9371ff18c173bbf26793036d0b9612ea4227678 Mon Sep 17 00:00:00 2001
From: Natalia Polina <natalia.polina@intel.com>
Date: Fri, 16 Sep 2022 15:49:22 -0500
Subject: [PATCH 5/6] Minor changes

---
 dpctl/tensor/_ctors.py                      |  4 ++--
 dpctl/tensor/libtensor/source/tensor_py.cpp | 14 ++++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py
index e5db30ac3e..0e50f38f05 100644
--- a/dpctl/tensor/_ctors.py
+++ b/dpctl/tensor/_ctors.py
@@ -1130,7 +1130,7 @@ def tril(X, k=0):
     k = operator.index(k)
 
     # F_CONTIGUOUS = 2
-    order = "f" if (X.flags & 2) else "c"
+    order = "F" if (X.flags & 2) else "C"
 
     shape = X.shape
     nd = X.ndim
@@ -1171,7 +1171,7 @@ def triu(X, k=0):
     k = operator.index(k)
 
     # F_CONTIGUOUS = 2
-    order = "f" if (X.flags & 2) else "c"
+    order = "F" if (X.flags & 2) else "C"
 
     shape = X.shape
     nd = X.ndim
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 7e9cf96122..533a36e512 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -2024,7 +2024,10 @@ tri(sycl::queue &exec_q,
     }
 
     // check that arrays do not overlap, and concurrent copying is safe.
+    char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
     auto src_offsets = src.get_minmax_offsets();
+    auto dst_offsets = dst.get_minmax_offsets();
     int src_elem_size = src.get_elemsize();
     int dst_elem_size = dst.get_elemsize();
 
@@ -2112,9 +2115,8 @@ tri(sycl::queue &exec_q,
     int nd = src_nd - 2;
     const py::ssize_t *shape = src_shape;
     const py::ssize_t *p_src_strides = src_strides.data();
-    ;
     const py::ssize_t *p_dst_strides = dst_strides.data();
-    ;
+
     simplify_iteration_space(nd, shape, p_src_strides, src_itemsize,
                              is_src_c_contig, is_src_f_contig, p_dst_strides,
                              dst_itemsize, is_dst_c_contig, is_dst_f_contig,
@@ -2160,14 +2162,14 @@ tri(sycl::queue &exec_q,
     if (part == 'l') {
         auto fn = tril_generic_dispatch_vector[src_typeid];
         tri_ev =
-            fn(exec_q, inner_range, outer_range, src.get_data(), dst.get_data(),
-               nd, dev_shape_and_strides, k, depends, {copy_shape_and_strides});
+            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
+               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
     }
     else {
         auto fn = triu_generic_dispatch_vector[src_typeid];
         tri_ev =
-            fn(exec_q, inner_range, outer_range, src.get_data(), dst.get_data(),
-               nd, dev_shape_and_strides, k, depends, {copy_shape_and_strides});
+            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
+               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
     }
 
     exec_q.submit([&](sycl::handler &cgh) {

From b9c560a19d3cae7bf2953688cd852e4828b52e91 Mon Sep 17 00:00:00 2001
From: Natalia Polina <natalia.polina@intel.com>
Date: Fri, 16 Sep 2022 15:50:50 -0500
Subject: [PATCH 6/6] Added tests for tril() and triu() functions

---
 dpctl/tests/test_usm_ndarray_ctor.py | 157 +++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index c77933d56a..feabecb1ff 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -1274,6 +1274,159 @@ def test_eye(dtype, usm_kind):
     assert np.array_equal(Xnp, dpt.asnumpy(X))
 
 
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_tril(dtype):
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+
+    if dtype in ["f8", "c16"] and q.sycl_device.has_aspect_fp64 is False:
+        pytest.skip(
+            "Device does not support double precision floating point type"
+        )
+    shape = (2, 3, 4, 5, 5)
+    X = dpt.reshape(
+        dpt.arange(np.prod(shape), dtype=dtype, sycl_queue=q), shape
+    )
+    Y = dpt.tril(X)
+    Xnp = np.arange(np.prod(shape), dtype=dtype).reshape(shape)
+    Ynp = np.tril(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_triu(dtype):
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+
+    if dtype in ["f8", "c16"] and q.sycl_device.has_aspect_fp64 is False:
+        pytest.skip(
+            "Device does not support double precision floating point type"
+        )
+    shape = (4, 5)
+    X = dpt.reshape(
+        dpt.arange(np.prod(shape), dtype=dtype, sycl_queue=q), shape
+    )
+    Y = dpt.triu(X, 1)
+    Xnp = np.arange(np.prod(shape), dtype=dtype).reshape(shape)
+    Ynp = np.triu(Xnp, 1)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_tril_slice():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    shape = (6, 10)
+    X = dpt.reshape(
+        dpt.arange(np.prod(shape), dtype="int", sycl_queue=q), shape
+    )[1:, ::-2]
+    Y = dpt.tril(X)
+    Xnp = np.arange(np.prod(shape), dtype="int").reshape(shape)[1:, ::-2]
+    Ynp = np.tril(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_triu_permute_dims():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+
+    shape = (2, 3, 4, 5)
+    X = dpt.permute_dims(
+        dpt.reshape(
+            dpt.arange(np.prod(shape), dtype="int", sycl_queue=q), shape
+        ),
+        (3, 2, 1, 0),
+    )
+    Y = dpt.triu(X)
+    Xnp = np.transpose(
+        np.arange(np.prod(shape), dtype="int").reshape(shape), (3, 2, 1, 0)
+    )
+    Ynp = np.triu(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_tril_broadcast_to():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    shape = (5, 5)
+    X = dpt.broadcast_to(dpt.ones((1), dtype="int", sycl_queue=q), shape)
+    Y = dpt.tril(X)
+    Xnp = np.broadcast_to(np.ones((1), dtype="int"), shape)
+    Ynp = np.tril(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+def test_triu_bool():
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+
+    shape = (4, 5)
+    X = dpt.ones((shape), dtype="bool", sycl_queue=q)
+    Y = dpt.triu(X)
+    Xnp = np.ones((shape), dtype="bool")
+    Ynp = np.triu(Xnp)
+    assert Y.dtype == Ynp.dtype
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("order", ["F", "C"])
+@pytest.mark.parametrize("k", [-10, -2, -1, 3, 4, 10])
+def test_triu_order_k(order, k):
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    shape = (3, 3)
+    X = dpt.reshape(
+        dpt.arange(np.prod(shape), dtype="int", sycl_queue=q),
+        shape,
+        order=order,
+    )
+    Y = dpt.triu(X, k)
+    Xnp = np.arange(np.prod(shape), dtype="int").reshape(shape, order=order)
+    Ynp = np.triu(Xnp, k)
+    assert Y.dtype == Ynp.dtype
+    assert X.flags == Y.flags
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
+@pytest.mark.parametrize("order", ["F", "C"])
+@pytest.mark.parametrize("k", [-10, -4, -3, 1, 2, 10])
+def test_tril_order_k(order, k):
+    try:
+        q = dpctl.SyclQueue()
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("Queue could not be created")
+    shape = (3, 3)
+    X = dpt.reshape(
+        dpt.arange(np.prod(shape), dtype="int", sycl_queue=q),
+        shape,
+        order=order,
+    )
+    Y = dpt.tril(X, k)
+    Xnp = np.arange(np.prod(shape), dtype="int").reshape(shape, order=order)
+    Ynp = np.tril(Xnp, k)
+    assert Y.dtype == Ynp.dtype
+    assert X.flags == Y.flags
+    assert np.array_equal(Ynp, dpt.asnumpy(Y))
+
+
 def test_common_arg_validation():
     order = "I"
     # invalid order must raise ValueError
@@ -1306,3 +1459,7 @@ def test_common_arg_validation():
         dpt.ones_like(X)
     with pytest.raises(TypeError):
         dpt.full_like(X, 1)
+    with pytest.raises(TypeError):
+        dpt.tril(X)
+    with pytest.raises(TypeError):
+        dpt.triu(X)