From 2475a9022740faafeb367beee73e401c2428e4a3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 5 May 2021 08:18:12 -0500 Subject: [PATCH 1/3] Added dpctl/tensor/_usmarray submodule Added Cython extension class dpctl.tensor.usm_ndarray that represents strided layout array over SYCL USM memory chunk, supporting 3 USM types: 'device', 'shared', 'host'. The container implements constructor, certain properties and basic slicing for now. The container allocates memory using dpctl.memory memory buffers specific to USM type. --- .flake8 | 1 + .gitignore | 1 + dpctl/tensor/__init__.py | 6 + dpctl/tensor/_slicing.pxi | 106 +++++++ dpctl/tensor/_stride_utils.pxi | 196 ++++++++++++ dpctl/tensor/_types.pxi | 110 +++++++ dpctl/tensor/_usmarray.pxd | 23 ++ dpctl/tensor/_usmarray.pyx | 449 +++++++++++++++++++++++++++ dpctl/tests/test_usm_ndarray_ctor.py | 129 ++++++++ libtensor/include/usm_array.hpp | 103 ++++++ setup.py | 52 +++- 11 files changed, 1162 insertions(+), 14 deletions(-) create mode 100755 dpctl/tensor/_slicing.pxi create mode 100644 dpctl/tensor/_stride_utils.pxi create mode 100644 dpctl/tensor/_types.pxi create mode 100644 dpctl/tensor/_usmarray.pxd create mode 100644 dpctl/tensor/_usmarray.pyx create mode 100644 dpctl/tests/test_usm_ndarray_ctor.py create mode 100644 libtensor/include/usm_array.hpp diff --git a/.flake8 b/.flake8 index bbe8a35244..557ad738ea 100644 --- a/.flake8 +++ b/.flake8 @@ -22,6 +22,7 @@ per-file-ignores = dpctl/_sycl_queue_manager.pyx: E999, E225 dpctl/memory/_memory.pyx: E999, E225, E226, E227 dpctl/program/_program.pyx: E999, E225, E226, E227 + dpctl/tensor/_usmarray.pyx: E999, E225, E226, E227 dpctl/tensor/numpy_usm_shared.py: F821 examples/cython/sycl_buffer/_buffer_example.pyx: E999, E225, E402 examples/cython/sycl_direct_linkage/_buffer_example.pyx: E999, E225, E402 diff --git a/.gitignore b/.gitignore index dfe778117a..730b9681f4 100644 --- a/.gitignore +++ b/.gitignore @@ -96,3 +96,4 @@ dpctl/_sycl_event.h 
dpctl/_sycl_queue.h dpctl/_sycl_queue_manager.h dpctl/memory/_memory.h +dpctl/tensor/_usmarray.h diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index a393af69fe..ae3c0b6e31 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -27,3 +27,9 @@ underlying memory buffer is allocated with a USM shared memory allocator. """ + +from dpctl.tensor._usmarray import usm_ndarray + +__all__ = [ + "usm_ndarray", +] diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi new file mode 100755 index 0000000000..9579567795 --- /dev/null +++ b/dpctl/tensor/_slicing.pxi @@ -0,0 +1,106 @@ +import numbers + + +cdef object _basic_slice_meta(object ind, tuple shape, + tuple strides, Py_ssize_t offset): + """ + + """ + if ind is Ellipsis: + return (shape, strides, offset) + elif ind is None: + return ((1,) + shape, (0,) + strides, offset) + elif isinstance(ind, slice): + sl_start, sl_stop, sl_step = ind.indices(shape[0]) + sh0 = (sl_stop - sl_start) // sl_step + str0 = sl_step * strides[0] + new_strides = strides if (sl_step == 1) else (str0,) + strides[1:] + return ( + (sh0, ) + shape[1:], + new_strides, + offset + sl_start * strides[0] + ) + elif isinstance(ind, numbers.Integral): + if 0 <= ind < shape[0]: + return (shape[1:], strides[1:], offset + ind * strides[0]) + elif -shape[0] <= ind < 0: + return (shape[1:], strides[1:], + offset + (shape[0] + ind) * strides[0]) + else: + raise IndexError( + "Index {0} is out of range for axes 0 with " + "size {1}".format(ind, shape[0])) + elif isinstance(ind, list): + raise NotImplemented + elif isinstance(ind, tuple): + axes_referenced = 0 + ellipses_count = 0 + newaxis_count = 0 + explicit_index = 0 + for i in ind: + if i is None: + newaxis_count = newaxis_count + 1 + elif i is Ellipsis: + ellipses_count = ellipses_count + 1 + elif isinstance(i, slice): + axes_referenced = axes_referenced + 1 + elif isinstance(i, numbers.Integral): + explicit_index = explicit_index + 1 + axes_referenced = 
axes_referenced + 1 + elif isinstance(i, list): + raise NotImplemented + else: + raise TypeError + if ellipses_count > 1: + raise IndexError( + "an index can only have a sinlge ellipsis ('...')") + if axes_referenced > len(shape): + raise IndexError( + "too many indices for an array, array is " + "{0}-dimensional, but {1} were indexed".format( + len(shape), axes_referenced)) + if ellipses_count: + ellipses_count = len(shape) - axes_referenced + new_shape_len = (newaxis_count + ellipses_count + + axes_referenced - explicit_index) + new_shape = list() + new_strides = list() + k = 0 + new_offset = offset + for i in range(len(ind)): + ind_i = ind[i] + if (ind_i is Ellipsis): + k_new = k + ellipses_count + new_shape.extend(shape[k:k_new]) + new_strides.extend(strides[k:k_new]) + k = k_new + elif ind_i is None: + new_shape.append(1) + new_strides.append(0) + elif isinstance(ind_i, slice): + k_new = k + 1 + sl_start, sl_stop, sl_step = ind_i.indices(shape[k]) + sh_i = (sl_stop - sl_start) // sl_step + str_i = sl_step * strides[k] + new_shape.append(sh_i) + new_strides.append(str_i) + new_offset = new_offset + sl_start * strides[k] + k = k_new + elif isinstance(ind_i, numbers.Integral): + if 0 <= ind_i < shape[k]: + k_new = k + 1 + new_offset = new_offset + ind_i * strides[k] + k = k_new + elif -shape[k] <= ind_i < 0: + k_new = k + 1 + new_offset = new_offset + (shape[k] + ind_i) * strides[k] + k = k_new + else: + raise IndexError( + "Index {0} is out of range for " + "axes {1} with size {2}".format(ind_i, k, shape[k])) + new_shape.extend(shape[k:]) + new_strides.extend(strides[k:]) + return (tuple(new_shape), tuple(new_strides), new_offset) + else: + raise TypeError diff --git a/dpctl/tensor/_stride_utils.pxi b/dpctl/tensor/_stride_utils.pxi new file mode 100644 index 0000000000..190f41d2ae --- /dev/null +++ b/dpctl/tensor/_stride_utils.pxi @@ -0,0 +1,196 @@ +# distutils: language = c++ +# cython: language_level=3 + +from cpython.mem cimport PyMem_Malloc +from cpython.ref 
cimport Py_INCREF +from cpython.tuple cimport PyTuple_New, PyTuple_SetItem + + +cdef int ERROR_MALLOC = 1 +cdef int ERROR_INTERNAL = -1 +cdef int ERROR_INCORRECT_ORDER = 2 +cdef int ERROR_UNEXPECTED_STRIDES = 3 + +cdef int USM_ARRAY_C_CONTIGUOUS = 1 +cdef int USM_ARRAY_F_CONTIGUOUS = 2 +cdef int USM_ARRAY_WRITEABLE = 4 + + +cdef Py_ssize_t shape_to_elem_count(int nd, Py_ssize_t *shape_arr): + """ + Computes number of elements in an array. + """ + cdef Py_ssize_t count = 1 + for i in range(nd): + count *= shape_arr[i] + return count + + +cdef int _from_input_shape_strides( + int nd, object shape, object strides, int itemsize, char order, + Py_ssize_t **shape_ptr, Py_ssize_t **strides_ptr, + Py_ssize_t *nelems, Py_ssize_t *min_disp, Py_ssize_t *max_disp, + int *contig): + """ + Arguments: nd, shape, strides, itemsize, order + Modifies: + shape_ptr - pointer to C array for shape values + strides_ptr - pointer to C array for strides values + nelems - Number of elements in array + min_disp = min( dot(strides, index), index for shape) + max_disp = max( dot(strides, index), index for shape) + contig = enumeration for array contiguity + Returns: 0 on success, error code otherwise. 
+ On success pointers point to allocated arrays, + Otherwise they are set to NULL + """ + cdef int i + cdef int all_incr = 1 + cdef int all_decr = 1 + cdef Py_ssize_t elem_count = 1 + cdef Py_ssize_t min_shift = 0 + cdef Py_ssize_t max_shift = 0 + cdef Py_ssize_t str_i + cdef Py_ssize_t* shape_arr + cdef Py_ssize_t* strides_arr + + # 0-d array + if (nd == 0): + contig[0] = USM_ARRAY_C_CONTIGUOUS + nelems[0] = 1 + min_disp[0] = 0 + max_disp[0] = 0 + shape_ptr[0] = (0) + strides_ptr[0] = (0) + return 0 + + shape_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not shape_arr): + return ERROR_MALLOC + shape_ptr[0] = shape_arr + for i in range(0, nd): + shape_arr[i] = shape[i] + elem_count *= shape_arr[i] + if elem_count == 0: + contig[0] = USM_ARRAY_C_CONTIGUOUS + nelems[0] = 1 + min_disp[0] = 0 + max_disp[0] = 0 + strides_ptr[0] = (0) + return 0 + nelems[0] = elem_count + + if (strides is None): + # no need to allocate and populate strides + if (int(order) not in [ord('C'), ord('F'), ord('c'), ord('f')]): + return ERROR_INCORRECT_ORDER + if order == ord('C') or order == ord('c'): + contig[0] = USM_ARRAY_C_CONTIGUOUS + else: + contig[0] = USM_ARRAY_F_CONTIGUOUS + min_disp[0] = 0 + max_disp[0] = (elem_count - 1) + strides_ptr[0] = (0) + return 0 + elif ((isinstance(strides, (list, tuple)) or hasattr(strides, 'tolist')) + and len(strides) == nd): + strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t)) + if (not strides_arr): + return ERROR_MALLOC + strides_ptr[0] = strides_arr + for i in range(0, nd): + str_i = strides[i] + strides_arr[i] = str_i + if str_i > 0: + max_shift += strides_arr[i] * (shape_arr[i] - 1) + else: + min_shift += strides_arr[i] * (shape_arr[i] - 1) + min_disp[0] = min_shift + max_disp[0] = max_shift + if max_shift == min_shift + (elem_count - 1): + if nd == 1: + contig[0] = USM_ARRAY_C_CONTIGUOUS + return 0 + for i in range(0, nd - 1): + if all_incr: + all_incr = strides_arr[i] < strides_arr[i + 1] + if all_decr: + all_decr = strides_arr[i] > 
strides_arr[i + 1] + if all_incr: + contig[0] = USM_ARRAY_C_CONTIGUOUS + elif all_decr: + contig[0] = USM_ARRAY_F_CONTIGUOUS + else: + contig[0] = 0 + return 0 + else: + contig[0] = 0 # non-contiguous + return 0 + else: + return ERROR_UNEXPECTED_STRIDES + # return ERROR_INTERNAL + + +cdef object _make_int_tuple(int nd, Py_ssize_t *ary): + """ + Makes Python tuple from C array + """ + cdef tuple res + cdef object tmp + if (ary): + res = PyTuple_New(nd) + for i in range(nd): + tmp = ary[i] + Py_INCREF(tmp) # SetItem steals the reference + PyTuple_SetItem(res, i, tmp) + return res + else: + return None + + +cdef object _make_reversed_int_tuple(int nd, Py_ssize_t *ary): + """ + Makes Python reversed tuple from C array + """ + cdef tuple res + cdef object tmp + cdef int i + cdef int nd_1 + if (ary): + res = PyTuple_New(nd) + nd_1 = nd - 1 + for i in range(nd): + tmp = ary[i] + Py_INCREF(tmp) # SetItem steals the reference + PyTuple_SetItem(res, nd_1 - i, tmp) + return res + else: + return None + + +cdef object _c_contig_strides(int nd, Py_ssize_t *shape): + """ + Makes Python tuple for C-contiguous array + """ + cdef tuple cc_strides = PyTuple_New(nd) + cdef object si = 1 + cdef int i + cdef int nd_1 = nd - 1 + for i in range(0, nd): + Py_INCREF(si) # SetItem steals the reference + PyTuple_SetItem(cc_strides, nd_1 - i, si) + si = si * shape[nd_1 - i] + return cc_strides + + +cdef object _f_contig_strides(int nd, Py_ssize_t *shape): + """ + Makes Python tuple for F-contiguous array + """ + cdef tuple fc_strides = PyTuple_New(nd) + cdef object si = 1 + for i in range(0, nd): + Py_INCREF(si) # SetItem steals the reference + PyTuple_SetItem(fc_strides, i, si) + si = si * shape[i] + return fc_strides diff --git a/dpctl/tensor/_types.pxi b/dpctl/tensor/_types.pxi new file mode 100644 index 0000000000..e88f79cc5c --- /dev/null +++ b/dpctl/tensor/_types.pxi @@ -0,0 +1,110 @@ +import numpy as np + + +# these typenum values are aligned to values in NumPy +cdef int UAR_BOOL = 0 +cdef int UAR_BYTE = 1 +cdef 
int UAR_UBYTE = 2 +cdef int UAR_SHORT = 3 +cdef int UAR_USHORT = 4 +cdef int UAR_INT = 5 +cdef int UAR_UINT = 6 +cdef int UAR_LONG = 7 +cdef int UAR_ULONG = 8 +cdef int UAR_LONGLONG = 9 +cdef int UAR_ULONGLONG = 10 +cdef int UAR_FLOAT = 11 +cdef int UAR_DOUBLE = 12 +cdef int UAR_CFLOAT = 14 +cdef int UAR_CDOUBLE = 15 +cdef int UAR_TYPE_SENTINEL = 17 +cdef int UAR_HALF = 23 + +cdef str _make_typestr(int typenum): + """ + Make typestring from type number + """ + cdef type_to_str = ['|b1', '|i1', '|u1', '|i2', '|u2', + '|i4', '|u4', '', '', '|i8', '|u8', + '|f4', '|f8', '', '|c8', '|c16', ''] + + if (typenum < 0): + return "" + if (typenum > 16): + if (typenum == 23): + return "|f2" + return "" + + return type_to_str[typenum] + + +cdef int type_bytesize(int typenum): + """ + NPY_BOOL=0 : 1 + NPY_BYTE=1 : 1 + NPY_UBYTE=2 : 1 + NPY_SHORT=3 : 2 + NPY_USHORT=4 : 2 + NPY_INT=5 : 4 + NPY_UINT=6 : 4 + NPY_LONG=7 : + NPY_ULONG=8 : + NPY_LONGLONG=9 : 8 + NPY_ULONGLONG=10 : 8 + NPY_FLOAT=11 : 4 + NPY_DOUBLE=12 : 8 + NPY_LONGDOUBLE=13 : N/A + NPY_CFLOAT=14 : 8 + NPY_CDOUBLE=15 : 16 + NPY_CLONGDOUBLE=16 : N/A + NPY_HALF=23 : 2 + """ + cdef int *type_to_bytesize = [ + 1, 1, 1, 2, 2, 4, 4, 8, 8, 8, 8, 4, 8, -1, 8, 16, -1] + + if typenum < 0: + return -1 + if typenum > 16: + if typenum == 23: + return 2 + return -1 + + return type_to_bytesize[typenum] + + +cdef int typenum_from_format(str s): + """ + Internal utility to convert string describing type format + + Format is [<|=>][biufc]# + Shortcuts for formats are i, u, d, D + """ + if not s: + raise TypeError("Format string '" + s + "' cannot be empty.") + try: + dt = np.dtype(s) + except Exception as e: + raise TypeError("Format '" + s + "' is not understood.") from e + if (dt.byteorder == ">"): + raise TypeError("Format '" + s + "' can only have native byteorder.") + return dt.num + + +cdef int dtype_to_typenum(dtype): + if isinstance(dtype, str): + return typenum_from_format(dtype) + elif isinstance(dtype, bytes): + return 
typenum_from_format(dtype.decode("UTF-8")) + elif hasattr(dtype, 'descr'): + obj = getattr(dtype, 'descr') + if (not isinstance(obj, list) or len(obj) != 1): + return -1 + obj = obj[0] + if (not isinstance(obj, tuple) or len(obj) != 2 or obj[0]): + return -1 + obj = obj[1] + if not isinstance(obj, str): + return -1 + return typenum_from_format(dtype) + else: + return -1 diff --git a/dpctl/tensor/_usmarray.pxd b/dpctl/tensor/_usmarray.pxd new file mode 100644 index 0000000000..d3688351df --- /dev/null +++ b/dpctl/tensor/_usmarray.pxd @@ -0,0 +1,23 @@ +# distutils: language = c++ +# cython: language_level=3 + +cdef public int USM_ARRAY_C_CONTIGUOUS +cdef public int USM_ARRAY_F_CONTIGUOUS +cdef public int USM_ARRAY_WRITEABLE + + +cdef public class usm_ndarray [object PyUSMArrayObject, type PyUSMArrayType]: + cdef char* data + cdef int nd + cdef Py_ssize_t *shape + cdef Py_ssize_t *strides + cdef int typenum + cdef int flags + cdef object base + + cdef void _reset(usm_ndarray self) + cdef void _cleanup(usm_ndarray self) + cdef usm_ndarray _clone(usm_ndarray self) + cdef Py_ssize_t get_offset(usm_ndarray self) except * + + cdef __cythonbufferdefaults__ = {"mode": "strided"} diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx new file mode 100644 index 0000000000..a69157ee82 --- /dev/null +++ b/dpctl/tensor/_usmarray.pyx @@ -0,0 +1,449 @@ +# distutils: language = c++ +# cython: language_level=3 + +import numpy as np + +import dpctl +import dpctl.memory as dpmem + +from cpython.mem cimport PyMem_Free +from cpython.tuple cimport PyTuple_New, PyTuple_SetItem + + +cdef extern from "usm_array.hpp" namespace "usm_array": + cdef cppclass usm_array: + usm_array(char *, int, size_t*, Py_ssize_t *, + int, int, DPCTLSyclQueueRef) except + + + +include "_stride_utils.pxi" +include "_types.pxi" +include "_slicing.pxi" + +cdef class InternalUSMArrayError(Exception): + """ + An InternalUSMArrayError exception is raised when internal + inconsistency has been detected. 
+ """ + pass + + +cdef class usm_ndarray: + """ + usm_ndarray( + shape, dtype="|f8", strides=None, buffer='device', + offset=0, order='C', + buffer_ctor_kwargs=dict() + ) + + See :class:`dpctl.memory.MemoryUSMShared` for allowed + keyword arguments. + + `buffer` can be 'shared', 'host', 'device' to allocate + new device memory by calling respective constructor with + the specified `buffer_ctor_kwargs`; `buffer` can be an + instance of :class:`dpctl.memory.MemoryUSMShared`, + :class:`dpctl.memory.MemoryUSMDevice`, or + :class:`dpctl.memory.MemoryUSMHost`; `buffer` can also be + another usm_ndarray instance, in which case its underlying + MemoryUSM* buffer is used for buffer. + """ + + cdef void _reset(usm_ndarray self): + """ + Initializes member fields + """ + self.base = None + self.nd = -1 + self.data = 0 + self.shape = 0 + self.strides = 0 + self.flags = 0 + + cdef void _cleanup(usm_ndarray self): + if (self.shape): + PyMem_Free(self.shape) + if (self.strides): + PyMem_Free(self.strides) + self._reset() + + cdef usm_ndarray _clone(self): + """ + Provides a copy of Python object pointing to the same data + """ + cdef int item_size = type_bytesize(self.typenum) + cdef Py_ssize_t offset_bytes = ( + ( self.data) - + ((self.base._pointer))) + cdef usm_ndarray res = usm_ndarray.__new__( + usm_ndarray, _make_int_tuple(self.nd, self.shape), + dtype=_make_typestr(self.typenum), + strides=( + _make_int_tuple(self.nd, self.strides) if (self.strides) + else None), + buffer=self.base, + offset=(offset_bytes // item_size), + order=('C' if (self.flags & USM_ARRAY_C_CONTIGUOUS) else 'F') + ) + res.flags = self.flags + if (res.data != self.data): + raise InternalUSMArrayError( + "Data pointers of cloned and original objects are different.") + return res + + def __cinit__(self, shape, dtype="|f8", strides=None, buffer='device', + Py_ssize_t offset=0, order='C', buffer_ctor_kwargs=dict()): + """ + strides and offset must be given in units of array elements. 
+ buffer can be strings ('device'|'shared'|'host' to allocate new memory) + or dpctl.memory.MemoryUSM* buffers, or usm_ndarray instances. + """ + cdef int nd = 9 + cdef int typenum = 0 + cdef int itemsize = 0 + cdef int err = 0 + cdef int contig_flag = 0 + cdef Py_ssize_t *shape_ptr = NULL + cdef Py_ssize_t ary_nelems = 0 + cdef Py_ssize_t ary_nbytes = 0 + cdef Py_ssize_t *strides_ptr = NULL + cdef Py_ssize_t _offset = offset + cdef Py_ssize_t ary_min_displacement = 0 + cdef Py_ssize_t ary_max_displacement = 0 + cdef char * data_ptr = NULL + + self._reset() + if (not isinstance(shape, (list, tuple)) + and not hasattr(shape, 'tolist')): + raise TypeError("Argument shape must be a list of a tuple.") + nd = len(shape) + typenum = dtype_to_typenum(dtype) + itemsize = type_bytesize(typenum) + if (itemsize < 1): + raise TypeError("dtype=" + dtype + " is not supported.") + # allocate host C-arrays for shape, strides + err = _from_input_shape_strides( + nd, shape, strides, itemsize, ord(order), + &shape_ptr, &strides_ptr, &ary_nelems, + &ary_min_displacement, &ary_max_displacement, &contig_flag + ) + if (err): + self._cleanup() + if err == ERROR_MALLOC: + raise MemoryError("Memory allocation for shape/strides " + "array failed.") + elif err == ERROR_INCORRECT_ORDER: + raise ValueError( + "Unsupported order='{}' given. " + "Supported values are 'C' or 'F'.".format(order)) + elif err == ERROR_UNEXPECTED_STRIDES: + raise ValueError( + "strides={} is not understood".format(strides)) + else: + raise InternalUSMArrayError( + " .. 
while processing shape and strides.") + ary_nbytes = (ary_max_displacement - + ary_min_displacement + 1) * itemsize + if isinstance(buffer, dpmem._memory._Memory): + _buffer = buffer + elif isinstance(buffer, (str, bytes)): + if isinstance(buffer, bytes): + buffer = buffer.decode("UTF-8") + _offset = -ary_min_displacement + if (buffer == "shared"): + _buffer = dpmem.MemoryUSMShared(ary_nbytes, + **buffer_ctor_kwargs) + elif (buffer == "device"): + _buffer = dpmem.MemoryUSMDevice(ary_nbytes, + **buffer_ctor_kwargs) + elif (buffer == "host"): + _buffer = dpmem.MemoryUSMHost(ary_nbytes, + **buffer_ctor_kwargs) + else: + self._cleanup() + raise ValueError( + "buffer='{}' is not understood. " + "Recognized values are 'device', 'shared', 'host', " + "or an object with __sycl_usm_array_interface__ " + "property".format(buffer)) + elif isinstance(buffer, usm_ndarray): + _buffer = buffer.usm_data + else: + self._cleanup() + raise ValueError("buffer='{}' was not understood.".format(buffer)) + if (_offset + ary_min_displacement < 0 or + (_offset + ary_max_displacement + 1) * itemsize > _buffer.nbytes): + self._cleanup() + raise ValueError("buffer='{}' can not accomodate the requested " + "array.".format(buffer)) + self.base = _buffer + self.data = ( ( _buffer._pointer)) + itemsize * _offset + self.shape = shape_ptr + self.strides = strides_ptr + self.typenum = typenum + self.flags = contig_flag + self.nd = nd + + def __dealloc__(self): + self._cleanup() + + cdef Py_ssize_t get_offset(self) except *: + cdef char *mem_ptr = NULL + cdef char *ary_ptr = self.data + mem_ptr = ( self.base._pointer) + byte_offset = ary_ptr - mem_ptr + item_size = type_bytesize(self.typenum) + if (byte_offset % item_size): + raise InternalUSMArrayError( + "byte_offset is not a multiple of item_size.") + return byte_offset // item_size + + @property + def __sycl_usm_array_interface__(self): + """ + Gives __sycl_usm_array_interface__ dictionary describing the array + """ + cdef Py_ssize_t byte_offset = 
-1 + cdef int item_size = -1 + cdef Py_ssize_t elem_offset = -1 + cdef char *mem_ptr = NULL + cdef char *ary_ptr = NULL + if (not isinstance(self.base, dpmem._memory._Memory)): + raise ValueError("Invalid instance of usm_ndarray ecountered") + ary_iface = self.base.__sycl_usm_array_interface__ + mem_ptr = ( ary_iface['data'][0]) + ary_ptr = ( self.data) + ro_flag = False if (self.flags & USM_ARRAY_WRITEABLE) else True + ary_iface['data'] = ( ary_ptr, ro_flag) + ary_iface['shape'] = _make_int_tuple(self.nd, self.shape) + if (self.strides): + ary_iface['strides'] = _make_int_tuple(self.nd, self.strides) + else: + if (self.flags & USM_ARRAY_C_CONTIGUOUS): + ary_iface['strides'] = None + elif (self.flags & USM_ARRAY_F_CONTIGUOUS): + ary_iface['strides'] = _f_contig_strides(self.nd, self.shape) + else: + raise ValueError("USM Array is not contiguous and " + "has empty strides") + ary_iface['typestr'] = _make_typestr(self.typenum) + byte_offset = ary_ptr - mem_ptr + item_size = type_bytesize(self.typenum) + if (byte_offset % item_size): + raise InternalUSMArrayError( + "byte_offset is not a multiple of item_size.") + elem_offset = byte_offset // item_size + ary_iface['offset'] = elem_offset + return ary_iface + + @property + def ndim(self): + """ + Gives the number of indices needed to address elements of this array. + """ + return int(self.nd) + + @property + def usm_data(self): + """ + Gives USM memory object underlying usm_array instance. + """ + return self.base + + @property + def shape(self): + """ + Elements of the shape tuple give the lengths of the + respective array dimensions. + """ + return _make_int_tuple(self.nd, self.shape) if self.nd > 0 else tuple() + + @property + def strides(self): + """ + Returns memory displacement in array elements, upon unit + change of respective index. + + E.g. 
for strides (s1, s2, s3) and multi-index (i1, i2, i3) + + a[i1, i2, i3] == (&a[0,0,0])[ s1*i1 + s2*i2 + s3*i3] + """ + if (self.strides): + return _make_int_tuple(self.nd, self.strides) + else: + if (self.flags & USM_ARRAY_C_CONTIGUOUS): + return _c_contig_strides(self.nd, self.shape) + elif (self.flags & USM_ARRAY_F_CONTIGUOUS): + return _f_contig_strides(self.nd, self.shape) + else: + raise ValueError("Inconsitent usm_ndarray data") + + @property + def flags(self): + """ + Currently returns integer whose bits correspond to the flags. + """ + return int(self.flags) + + @property + def usm_type(self): + """ + USM type of underlying memory. Can be 'device', 'shared', or 'host'. + + See: https://docs.oneapi.com/versions/latest/dpcpp/iface/usm.html + """ + return self.base.get_usm_type() + + @property + def itemsize(self): + """ + Size of array element in bytes. + """ + return type_bytesize(self.typenum) + + @property + def nbytes(self): + """ + Total bytes consumed by the elements of the array. + """ + return ( + shape_to_elem_count(self.nd, self.shape) * + type_bytesize(self.typenum)) + + @property + def size(self): + """ + Number of elements in the array. + """ + return shape_to_elem_count(self.nd, self.shape) + + @property + def dtype(self): + """ + Returns NumPy's dtype corresponding to the type of the array elements. + """ + return np.dtype(_make_typestr(self.typenum)) + + @property + def sycl_queue(self): + """ + Returns `dpctl.SyclQueue` object associated with USM data. + """ + return self.base._queue + + @property + def sycl_device(self): + """ + Returns `dpctl.SyclDevice` object on which USM data was allocated. + """ + return self.base._queue.sycl_device + + @property + def sycl_context(self): + """ + Returns `dpctl.SyclContext` object to which USM data is bound. 
+ """ + return self.base._queue.sycl_context + + @property + def T(self): + if self.nd < 2: + return self + else: + return _transpose(self) + + @property + def real(self): + if (self.typenum < UAR_CFLOAT): + # elements are real + return self + if (self.typenum < UAR_TYPE_SENTINEL): + return _real_view(self) + + @property + def imag(self): + if (self.typenum < UAR_CFLOAT): + # elements are real + return _zero_like(self) + if (self.typenum < UAR_TYPE_SENTINEL): + return _imag_view(self) + + def __getitem__(self, ind): + cdef tuple _meta = _basic_slice_meta( + ind, (self).shape, ( self).strides, + self.get_offset()) + cdef usm_ndarray res + + res = usm_ndarray.__new__( + usm_ndarray, _meta[0], + dtype=_make_typestr(self.typenum), + strides=_meta[1], + buffer=self.base, + offset=_meta[2] + ) + res.flags |= (self.flags & USM_ARRAY_WRITEABLE) + return res + + +cdef usm_ndarray _real_view(usm_ndarray ary): + """ + View into real parts of a complex type array + """ + cdef usm_ndarray r = ary._clone() + if (ary.typenum == UAR_CFLOAT): + r.typenum = UAR_FLOAT + elif (ary.typenum == UAR_CDOUBLE): + r.typenum = UAR_DOUBLE + else: + raise InternalUSMArrayError( + "_real_view call on array of non-complex type.") + return r + + +cdef usm_ndarray _imag_view(usm_ndarray ary): + """ + View into imaginary parts of a complex type array + """ + cdef usm_ndarray r = ary._clone() + if (ary.typenum == UAR_CFLOAT): + r.typenum = UAR_FLOAT + elif (ary.typenum == UAR_CDOUBLE): + r.typenum = UAR_DOUBLE + else: + raise InternalUSMArrayError( + "_real_view call on array of non-complex type.") + # displace pointer to imaginary part + r.data = r.data + type_bytesize(r.typenum) + return r + + +cdef usm_ndarray _transpose(usm_ndarray ary): + """ + Construct transposed array without copying the data + """ + cdef usm_ndarray r = usm_ndarray.__new__( + usm_ndarray, + _make_reversed_int_tuple(ary.nd, ary.shape), + dtype=_make_typestr(ary.typenum), + strides=( + _make_reversed_int_tuple(ary.nd, 
ary.strides) + if (ary.strides) else None), + buffer=ary.base, + order=('F' if (ary.flags & USM_ARRAY_C_CONTIGUOUS) else 'C') + ) + r.flags |= (ary.flags & USM_ARRAY_WRITEABLE) + return r + + +cdef usm_ndarray _zero_like(usm_ndarray ary): + """ + Make C-contiguous array of zero elements with same shape + and type as ary. + """ + cdef usm_ndarray r = usm_ndarray( + _make_int_tuple(ary.nd, ary.shape), + dtype=_make_typestr(ary.typenum), + buffer=ary.base.get_usm_type() + ) + # TODO: call function to set array elements to zero + return r diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py new file mode 100644 index 0000000000..fc11bad958 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -0,0 +1,129 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numbers + +import numpy as np +import pytest + +import dpctl + +# import dpctl.memory as dpmem +import dpctl.tensor as dpt + + +@pytest.mark.parametrize( + "shape", + [ + (), + (4,), + (0,), + (0, 1), + (0, 0), + (4, 5), + (2, 5, 2), + (2, 2, 2, 2, 2, 2, 2, 2), + ], +) +@pytest.mark.parametrize("usm_type", ["shared", "host", "device"]) +def test_allocate_usm_ndarray(shape, usm_type): + try: + q = dpctl.SyclQueue() + except dpctl.SyclCreationError: + pytest.skip("Default SYCL queue could not be created") + X = dpt.usm_ndarray( + shape, dtype="d", buffer=usm_type, buffer_ctor_kwargs={"queue": q} + ) + Xnp = np.ndarray(shape, dtype="d") + assert X.usm_type == usm_type + assert X.sycl_context == q.sycl_context + assert X.sycl_device == q.sycl_device + assert X.size == Xnp.size + assert X.shape == Xnp.shape + + +@pytest.mark.parametrize( + "dtype", + [ + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +def test_dtypes(dtype): + dpt.usm_ndarray((1,), dtype=dtype) + + +def test_properties(): + """ + Test that properties execute + """ + X = dpt.usm_ndarray((3, 4, 5), dtype="c16") + assert isinstance(X.sycl_queue, dpctl.SyclQueue) + assert isinstance(X.sycl_device, dpctl.SyclDevice) + assert isinstance(X.sycl_context, dpctl.SyclContext) + assert isinstance(X.dtype, np.dtype) + assert isinstance(X.__sycl_usm_array_interface__, dict) + assert isinstance(X.T, dpt.usm_ndarray) + assert isinstance(X.imag, dpt.usm_ndarray) + assert isinstance(X.real, dpt.usm_ndarray) + assert isinstance(X.shape, tuple) + assert isinstance(X.strides, tuple) + assert X.usm_type in ("shared", "device", "host") + assert isinstance(X.size, numbers.Integral) + assert isinstance(X.nbytes, numbers.Integral) + assert isinstance(X.ndim, numbers.Integral) + + +@pytest.mark.parametrize( + "ind", + [ + tuple(), + (None,), + ( + None, + Ellipsis, + None, + ), + (2, 2, None, 3, 4), + (Ellipsis,), + (None, slice(0, None, 2), Ellipsis, 
slice(0, None, 3)), + ( + slice(None, None, -1), + slice(None, None, -1), + slice(0, None, 3), + slice(1, None, 2), + ), + ], +) +def test_basic_slice(ind): + X = dpt.usm_ndarray((2 * 3, 2 * 4, 3 * 5, 3 * 7), dtype="u1") + Xnp = np.empty(X.shape, dtype=X.dtype) + S = X[ind] + Snp = Xnp[ind] + assert S.shape == Snp.shape + assert S.strides == Snp.strides + assert S.dtype == X.dtype diff --git a/libtensor/include/usm_array.hpp b/libtensor/include/usm_array.hpp new file mode 100644 index 0000000000..bcaf15dc22 --- /dev/null +++ b/libtensor/include/usm_array.hpp @@ -0,0 +1,103 @@ +#pragma once + +#include "dpctl_sycl_types.h" +#include + +namespace usm_array +{ + +class strided_array +{ +public: + strided_array() {} + explicit strided_array(char *ptr, int nd, size_t *shape, int typenum) + : ptr_(ptr), nd_(nd), shape_(shape), typenum_(typenum){}; + explicit strided_array(char *ptr, + int nd, + size_t *shape, + std::ptrdiff_t *strides, + int typenum) + : ptr_(ptr), nd_(nd), shape_(shape), strides_(strides), + typenum_(typenum){}; + explicit strided_array(char *ptr, + int nd, + size_t *shape, + std::ptrdiff_t *strides, + int typenum, + int flags) + : ptr_(ptr), nd_(nd), shape_(shape), strides_(strides), + typenum_(typenum), flags_(flags){}; + strided_array(const strided_array &other) = default; + strided_array(strided_array &&other) = default; + ~strided_array() = default; + + // member access functions + char *get_data_ptr() const + { + return ptr_; + } + int ndim() const + { + return nd_; + } + size_t *get_shape_ptr() const + { + return shape_; + } + std::ptrdiff_t *get_strides_ptr() const + { + return strides_; + } + int typenum() const + { + return typenum_; + } + int flags() const + { + return flags_; + } + + size_t get_shape(int i) const + { + return shape_[i]; + } + std::ptrdiff_t get_stride(int i) const + { + return strides_[i]; + } + +private: + char *ptr_{0}; + int nd_{0}; + size_t *shape_{0}; + std::ptrdiff_t *strides_{0}; + int typenum_{0}; + int flags_{0}; 
+}; + +class usm_array : public strided_array +{ +public: + explicit usm_array(char *data, + int nd, + size_t *shape, + std::ptrdiff_t *strides, + int typenum, + int flags, + DPCTLSyclQueueRef qref) + : strided_array(data, nd, shape, strides, typenum, flags), q_(qref){}; + + usm_array(const usm_array &other) = default; + usm_array(usm_array &&other) = default; + ~usm_array() = default; + + DPCTLSyclQueueRef get_queue_ref() const + { + return q_; + } + +private: + DPCTLSyclQueueRef q_{0}; +}; + +} // namespace usm_array diff --git a/setup.py b/setup.py index 9d9a94fcdc..0ce77f32ea 100644 --- a/setup.py +++ b/setup.py @@ -150,14 +150,15 @@ def extensions(): dpctl_sycl_interface_include, ], "include_dirs": [np.get_include(), dpctl_sycl_interface_include], - "extra_compile_args": eca - + get_other_cxxflags() - + get_suppressed_warning_flags(), + "extra_compile_args": ( + eca + get_other_cxxflags() + get_suppressed_warning_flags() + ), "extra_link_args": ela, "libraries": libs, "library_dirs": librarys, "runtime_library_dirs": runtime_library_dirs, "language": "c++", + "define_macros": [], } if CODE_COVERAGE: @@ -175,69 +176,92 @@ def extensions(): [ os.path.join("dpctl", "_sycl_context.pyx"), ], - **extension_args + **extension_args, ), Extension( "dpctl._sycl_device", [ os.path.join("dpctl", "_sycl_device.pyx"), ], - **extension_args + **extension_args, ), Extension( "dpctl._sycl_device_factory", [ os.path.join("dpctl", "_sycl_device_factory.pyx"), ], - **extension_args + **extension_args, ), Extension( "dpctl._sycl_event", [ os.path.join("dpctl", "_sycl_event.pyx"), ], - **extension_args + **extension_args, ), Extension( "dpctl._sycl_platform", [ os.path.join("dpctl", "_sycl_platform.pyx"), ], - **extension_args + **extension_args, ), Extension( "dpctl._sycl_queue", [ os.path.join("dpctl", "_sycl_queue.pyx"), ], - **extension_args + **extension_args, ), Extension( "dpctl._sycl_queue_manager", [ os.path.join("dpctl", "_sycl_queue_manager.pyx"), ], - **extension_args + 
**extension_args, ), Extension( "dpctl.memory._memory", [ os.path.join("dpctl", "memory", "_memory.pyx"), ], - **extension_args + **extension_args, ), Extension( "dpctl.program._program", [ os.path.join("dpctl", "program", "_program.pyx"), ], - **extension_args + **extension_args, + ), + Extension( + "dpctl.tensor._usmarray", + [ + os.path.join("dpctl", "tensor", "_usmarray.pyx"), + ], + depends=extension_args["depends"] + + [os.path.join("libtensor", "include", "usm_array.hpp")], + language="c++", + include_dirs=( + extension_args["include_dirs"] + + [os.path.join("libtensor", "include")] + ), + extra_compile_args=extension_args["extra_compile_args"], + extra_link_args=extension_args["extra_link_args"], + libraries=extension_args["libraries"], + library_dirs=extension_args["library_dirs"], + runtime_library_dirs=extension_args["runtime_library_dirs"], + define_macros=extension_args["define_macros"], ), ] if CODE_COVERAGE: - exts = cythonize(extensions, compiler_directives={"linetrace": True}) + exts = cythonize( + extensions, + compiler_directives={"linetrace": True}, + language_level=3, + ) else: - exts = cythonize(extensions) + exts = cythonize(extensions, language_level=3) return exts From 69986fe01df5c5da7bf8d048899289c7d93522d8 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 6 May 2021 13:18:25 -0500 Subject: [PATCH 2/3] Removed all default constructors, added license, comments --- libtensor/include/usm_array.hpp | 46 +++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/libtensor/include/usm_array.hpp b/libtensor/include/usm_array.hpp index bcaf15dc22..dc40fc1c50 100644 --- a/libtensor/include/usm_array.hpp +++ b/libtensor/include/usm_array.hpp @@ -1,3 +1,27 @@ +//===----------- usm_array.hpp - class representing an array -*-C++-*- ===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2021 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not 
use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the classes strided_array and usm_array +//===----------------------------------------------------------------------===// + #pragma once #include "dpctl_sycl_types.h" @@ -9,7 +33,16 @@ namespace usm_array class strided_array { public: - strided_array() {} + /* strided_array is a data-only class encapsulating information about + * a type-homogeneous nd-array. + * ptr : pointer to memory block storing array values + * nd : number of indices needed to reference an array element + * shape : pointer to C-array of length `nd` of array dimensions + * strides : pointer to C-array of length `nd` of memory displacements + * for unit increment of each index + * typenum : an integer (enum) encoding the value type of array elements + * flags : field to encode additional array attributes + */ explicit strided_array(char *ptr, int nd, size_t *shape, int typenum) : ptr_(ptr), nd_(nd), shape_(shape), typenum_(typenum){}; explicit strided_array(char *ptr, @@ -27,9 +60,6 @@ class strided_array int flags) : ptr_(ptr), nd_(nd), shape_(shape), strides_(strides), typenum_(typenum), flags_(flags){}; - strided_array(const strided_array &other) = default; - strided_array(strided_array &&other) = default; - ~strided_array() = default; // member access functions char *get_data_ptr() const @@ -78,6 +108,10 @@ class strided_array class usm_array : public strided_array { public: + /* + * usm_array additionally
carries a DPCTLSyclQueueRef + * recording the SYCL context the USM data pointer is bound to + */ explicit usm_array(char *data, int nd, size_t *shape, @@ -87,10 +121,6 @@ class usm_array : public strided_array DPCTLSyclQueueRef qref) : strided_array(data, nd, shape, strides, typenum, flags), q_(qref){}; - usm_array(const usm_array &other) = default; - usm_array(usm_array &&other) = default; - ~usm_array() = default; - DPCTLSyclQueueRef get_queue_ref() const { return q_; From bc15b77b44c7e3dc74ecb8ed85b85588d12be138 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 6 May 2021 14:52:13 -0500 Subject: [PATCH 3/3] added license headers --- dpctl/tensor/_slicing.pxi | 16 ++++++++++++++++ dpctl/tensor/_stride_utils.pxi | 16 ++++++++++++++++ dpctl/tensor/_types.pxi | 16 ++++++++++++++++ dpctl/tensor/_usmarray.pyx | 16 ++++++++++++++++ 4 files changed, 64 insertions(+) diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 9579567795..bde0a8da1d 100755 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ import numbers diff --git a/dpctl/tensor/_stride_utils.pxi b/dpctl/tensor/_stride_utils.pxi index 190f41d2ae..a2ef0740d0 100644 --- a/dpctl/tensor/_stride_utils.pxi +++ b/dpctl/tensor/_stride_utils.pxi @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # distutils: language = c++ # cython: language_level=3 diff --git a/dpctl/tensor/_types.pxi b/dpctl/tensor/_types.pxi index e88f79cc5c..b087be4c3a 100644 --- a/dpctl/tensor/_types.pxi +++ b/dpctl/tensor/_types.pxi @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import numpy as np diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index a69157ee82..537a539873 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -1,3 +1,19 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # distutils: language = c++ # cython: language_level=3