From 3fe25706995e76255a931d8ed87786da69db685c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 12 May 2021 11:01:13 -0500 Subject: [PATCH 1/3] Fixed issues in slicing and array construction These were discovered while preparing for customer presentation --- dpctl/tensor/_slicing.pxi | 25 +++++++-- dpctl/tensor/_stride_utils.pxi | 23 ++++++-- dpctl/tensor/_usmarray.pyx | 3 +- dpctl/tests/test_usm_ndarray_ctor.py | 82 +++++++++++++++++++++++++++- 4 files changed, 123 insertions(+), 10 deletions(-) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index d4841c46d7..321eff0b0e 100755 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -17,6 +17,23 @@ import numbers +cdef Py_ssize_t _slice_len( + Py_ssize_t sl_start, + Py_ssize_t sl_stop, + Py_ssize_t sl_step +): + """ + Compute len(range(sl_start, sl_stop, sl_step)) + """ + if sl_start == sl_stop or (sl_stop > sl_start) != (sl_step > 0): + return 0 + if sl_step > 0: + # 1 + argmax k such that sl_start + sl_step*k < sl_stop + return 1 + ((sl_stop - sl_start - 1) // sl_step) + else: + return 1 + ((sl_stop - sl_start + 1) // sl_step) + + cdef object _basic_slice_meta(object ind, tuple shape, tuple strides, Py_ssize_t offset): """ @@ -33,9 +50,9 @@ cdef object _basic_slice_meta(object ind, tuple shape, return ((1,) + shape, (0,) + strides, offset) elif isinstance(ind, slice): sl_start, sl_stop, sl_step = ind.indices(shape[0]) - sh0 = (sl_stop - sl_start) // sl_step + sh0 = _slice_len(sl_start, sl_stop, sl_step) str0 = sl_step * strides[0] - new_strides = strides if (sl_step == 1) else (str0,) + strides[1:] + new_strides = strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:] return ( (sh0, ) + shape[1:], new_strides, @@ -101,8 +118,8 @@ cdef object _basic_slice_meta(object ind, tuple shape, elif isinstance(ind_i, slice): k_new = k + 1 sl_start, sl_stop, sl_step = ind_i.indices(shape[k]) - sh_i = (sl_stop - sl_start) // sl_step - str_i = sl_step * strides[k] + sh_i = _slice_len(sl_start, sl_stop, sl_step) +
str_i = (1 if sh_i == 0 else sl_step) * strides[k] new_shape.append(sh_i) new_strides.append(str_i) new_offset = new_offset + sl_start * strides[k] diff --git a/dpctl/tensor/_stride_utils.pxi b/dpctl/tensor/_stride_utils.pxi index a2ef0740d0..a3fe92579b 100644 --- a/dpctl/tensor/_stride_utils.pxi +++ b/dpctl/tensor/_stride_utils.pxi @@ -72,7 +72,7 @@ cdef int _from_input_shape_strides( # 0-d array if (nd == 0): - contig[0] = USM_ARRAY_C_CONTIGUOUS + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) nelems[0] = 1 min_disp[0] = 0 max_disp[0] = 0 @@ -88,17 +88,28 @@ cdef int _from_input_shape_strides( shape_arr[i] = shape[i] elem_count *= shape_arr[i] if elem_count == 0: - contig[0] = USM_ARRAY_C_CONTIGUOUS + contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS) nelems[0] = 1 min_disp[0] = 0 max_disp[0] = 0 - strides_ptr[0] = (0) + if strides is None: + strides_ptr[0] = (0) + else: + strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not strides_arr): + PyMem_Free(shape_ptr[0]); + shape_ptr[0] = (0) + return ERROR_MALLOC + strides_ptr[0] = strides_arr + for i in range(0, nd): + strides_arr[i] = strides[i] return 0 nelems[0] = elem_count - if (strides is None): # no need to allocate and populate strides if (int(order) not in [ord('C'), ord('F'), ord('c'), ord('f')]): + PyMem_Free(shape_ptr[0]); + shape_ptr[0] = (0) return ERROR_INCORRECT_ORDER if order == ord('C') or order == ord('c'): contig[0] = USM_ARRAY_C_CONTIGUOUS @@ -112,6 +123,8 @@ cdef int _from_input_shape_strides( and len(strides) == nd): strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) if (not strides_arr): + PyMem_Free(shape_ptr[0]); + shape_ptr[0] = (0) return ERROR_MALLOC strides_ptr[0] = strides_arr for i in range(0, nd): @@ -143,6 +156,8 @@ cdef int _from_input_shape_strides( contig[0] = 0 # non-contiguous return 0 else: + PyMem_Free(shape_ptr[0]); + shape_ptr[0] = (0) return ERROR_UNEXPECTED_STRIDES # return ERROR_INTERNAL diff --git a/dpctl/tensor/_usmarray.pyx 
b/dpctl/tensor/_usmarray.pyx index 7663f5415c..442543c947 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -475,7 +475,8 @@ cdef class usm_ndarray: cdef usm_ndarray res res = usm_ndarray.__new__( - usm_ndarray, _meta[0], + usm_ndarray, + _meta[0], dtype=_make_typestr(self.typenum_), strides=_meta[1], buffer=self.base_, diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index 755e48570f..dae17da5e5 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -17,12 +17,14 @@ import numbers import numpy as np +import numpy.lib.stride_tricks as np_st import pytest import dpctl # import dpctl.memory as dpmem import dpctl.tensor as dpt +from dpctl.tensor._usmarray import Device @pytest.mark.parametrize( @@ -112,6 +114,8 @@ def test_properties(): (2, 2, None, 3, 4), (Ellipsis,), (None, slice(0, None, 2), Ellipsis, slice(0, None, 3)), + (None, slice(1, None, 2), Ellipsis, slice(1, None, 3)), + (None, slice(None, -1, -2), Ellipsis, slice(2, None, 3)), ( slice(None, None, -1), slice(None, None, -1), @@ -121,10 +125,86 @@ def test_properties(): ], ) def test_basic_slice(ind): - X = dpt.usm_ndarray((2 * 3, 2 * 4, 3 * 5, 3 * 7), dtype="u1") + X = dpt.usm_ndarray((2 * 3, 2 * 4, 3 * 5, 2 * 7), dtype="u1") Xnp = np.empty(X.shape, dtype=X.dtype) S = X[ind] Snp = Xnp[ind] assert S.shape == Snp.shape assert S.strides == Snp.strides assert S.dtype == X.dtype + + +def _from_numpy(np_ary, device=None, usm_type="shared"): + if type(np_ary) is np.ndarray: + if np_ary.flags["FORC"]: + x = np_ary + else: + x = np.ascontiguousarray(np_ary) + R = dpt.usm_ndarray( + np_ary.shape, + dtype=np_ary.dtype, + buffer=usm_type, + buffer_ctor_kwargs={ + "queue": Device.create_device(device).sycl_queue + }, + ) + R.usm_data.copy_from_host(x.reshape((-1)).view("|u1")) + return R + else: + raise ValueError("Expected numpy.ndarray, got {}".format(type(np_ary))) + + +def _to_numpy(usm_ary): + if type(usm_ary) is
dpt.usm_ndarray: + usm_buf = usm_ary.usm_data + s = usm_buf.nbytes + host_buf = usm_buf.copy_to_host().view(usm_ary.dtype) + usm_ary_itemsize = usm_ary.itemsize + R_offset = ( + usm_ary.__sycl_usm_array_interface__["offset"] * usm_ary_itemsize + ) + R = np.ndarray((s,), dtype="u1", buffer=host_buf) + R = R[R_offset:].view(usm_ary.dtype) + R_strides = (usm_ary_itemsize * si for si in usm_ary.strides) + return np_st.as_strided(R, shape=usm_ary.shape, strides=R_strides) + else: + raise ValueError( + "Expected dpctl.tensor.usm_ndarray, got {}".format(type(usm_ary)) + ) + + +def test_slice_constructor_1d(): + Xh = np.arange(37, dtype="i4") + Xusm = _from_numpy(Xh, device="gpu", usm_type="device") + for ind in [ + slice(1, None, 2), + slice(0, None, 3), + slice(1, None, 3), + slice(2, None, 3), + slice(None, None, -1), + slice(-2, 2, -2), + slice(-1, 1, -2), + slice(None, None, -13), + ]: + assert np.array_equal( + _to_numpy(Xusm[ind]), Xh[ind] + ), "Failed for {}".format(ind) + + +def test_slice_constructor_3d(): + Xh = np.empty((37, 24, 35), dtype="i4") + Xusm = _from_numpy(Xh, device="gpu", usm_type="device") + for ind in [ + slice(1, None, 2), + slice(0, None, 3), + slice(1, None, 3), + slice(2, None, 3), + slice(None, None, -1), + slice(-2, 2, -2), + slice(-1, 1, -2), + slice(None, None, -13), + (slice(None, None, -2), Ellipsis, None, 15), + ]: + assert np.array_equal( + _to_numpy(Xusm[ind]), Xh[ind] + ), "Failed for {}".format(ind) From cc201c3322547400354b9a731f3533aad3cfa604 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 12 May 2021 17:38:38 -0500 Subject: [PATCH 2/3] Filled empty strings in lookup typenum -> str, and typenum->itemsiz 'i4' translates to typenum 5 on Linux, but to typenum 7 on Windows, and this entry in the look-up array was not meaningfully populated --- dpctl/tensor/_types.pxi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_types.pxi b/dpctl/tensor/_types.pxi index 74a2b534d7..7fcc20baf2 100644 
--- a/dpctl/tensor/_types.pxi +++ b/dpctl/tensor/_types.pxi @@ -41,7 +41,7 @@ cdef str _make_typestr(int typenum): Make typestring from type number """ cdef type_to_str = ['|b1', '|i1', '|u1', '|i2', '|u2', - '|i4', '|u4', '', '', '|i8', '|u8', + '|i4', '|u4', '|i4', '|u4', '|i8', '|u8', '|f4', '|f8', '', '|c8', '|c16', ''] if (typenum < 0): @@ -63,8 +63,8 @@ cdef int type_bytesize(int typenum): NPY_USHORT=4 : 2 NPY_INT=5 : 4 NPY_UINT=6 : 4 - NPY_LONG=7 : - NPY_ULONG=8 : + NPY_LONG=7 : 4 + NPY_ULONG=8 : 4 NPY_LONGLONG=9 : 8 NPY_ULONGLONG=10 : 8 NPY_FLOAT=11 : 4 @@ -76,7 +76,7 @@ cdef int type_bytesize(int typenum): NPY_HALF=23 : 2 """ cdef int *type_to_bytesize = [ - 1, 1, 1, 2, 2, 4, 4, 8, 8, 8, 8, 4, 8, -1, 8, 16, -1] + 1, 1, 1, 2, 2, 4, 4, 4, 4, 8, 8, 4, 8, -1, 8, 16, -1] if typenum < 0: return -1 From d240a38cc355200a3cca8a99a0bc877ffaeac0dc Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 13 May 2021 12:35:49 -0500 Subject: [PATCH 3/3] Allow Device.create_device(None) This uses dpctl.SyclQueue() to create the queue from default selector --- dpctl/tensor/_device.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/_device.py b/dpctl/tensor/_device.py index feb4598957..b258c373bc 100644 --- a/dpctl/tensor/_device.py +++ b/dpctl/tensor/_device.py @@ -65,7 +65,10 @@ def create_device(cls, dev): "targeting this device".format(dev) ) else: - obj.sycl_queue_ = dpctl.SyclQueue(dev) + if dev is None: + obj.sycl_queue_ = dpctl.SyclQueue() + else: + obj.sycl_queue_ = dpctl.SyclQueue(dev) return obj @property