// SPDX-License-Identifier: BSD-3-Clause #include "sharpy/CollComm.hpp" namespace SHARPY { void bufferize(NDArray::ptr_type a_ptr, void *outPtr) { if (!outPtr) return; dispatch(a_ptr->dtype(), a_ptr->data(), [&a_ptr, outPtr](auto *ptr) { auto buff = static_cast(outPtr); auto shp = a_ptr->local_shape(); if (shp) { forall(0, ptr, shp, a_ptr->local_strides(), a_ptr->ndims(), [&buff](const auto *in) { *buff = *in; ++buff; }); } else { buff[0] = ptr[0]; } }); } // @param outPtr touched only if on root and/or root==REPLICATED or if not // distributed void gather_array(NDArray::ptr_type a_ptr, rank_type root, void *outPtr) { auto trscvr = a_ptr->transceiver(); if (!trscvr || a_ptr->owner() == REPLICATED) { bufferize(a_ptr, outPtr); return; } auto nranks = trscvr->nranks(); auto myrank = trscvr->rank(); bool sendonly = root != REPLICATED && root != myrank; auto dtype = a_ptr->dtype(); auto mysizes = a_ptr->local_shape(); auto mysz = mysizes[0]; auto myoff = a_ptr->local_offsets()[0]; auto nd = a_ptr->ndims(); auto gshape = a_ptr->shape(); auto myTileSz = std::accumulate(&gshape.data()[1], &gshape.data()[nd], 1, std::multiplies()); // allgather process local offset and sizes std::vector displacements(nranks); std::vector counts(nranks, 2); std::vector szsAndOffs(2 * nranks); for (size_t i = 0; i < nranks; ++i) { displacements[i] = i * 2; } szsAndOffs[2 * myrank + 0] = myoff; // FIXME split dim szsAndOffs[2 * myrank + 1] = mysz; trscvr->gather(szsAndOffs.data(), counts.data(), displacements.data(), INT32, REPLICATED); // compute each pranks local contribution int64_t curr = 0; for (auto i = 0ul; i < nranks; ++i) { assert(szsAndOffs[i * 2] * myTileSz == curr); displacements[i] = curr; counts[i] = szsAndOffs[i * 2 + 1] * myTileSz; curr += counts[i]; } // create buffer/numpy array and copy void *ptr = nullptr; bool need_del = false; if (sendonly) { if (mysz > 0 && a_ptr->is_sliced()) { ptr = new char[mysz * sizeof_dtype(dtype) * myTileSz]; bufferize(a_ptr, ptr); need_del = true; } else if (mysz > 0) { ptr = a_ptr->data(); } } else { ptr = outPtr; bufferize(a_ptr, &(static_cast( ptr)[displacements[myrank] * sizeof_dtype(dtype)])); } // final gather trscvr->gather(ptr, counts.data(), displacements.data(), dtype, root); if (need_del) delete[] static_cast(ptr); } // Compute offset and displacements when mapping n_slc to o_slc. This is // necessary when slices are not equally partitioned. // // We assume we split in first dimension. // We also assume partitions are assigned to ranks in sequence from 0-N. // With this we know that our buffers (old and new) get data in the // same order. The only thing which might have changed is the tile-size. // Actually, the tile-size might change only if old or new shape does not evenly // distribute data (e.g. last partition is smaller). // In theory we could re-shape in-place when the norm-tile-size does not change. // This is not implemented: we need an extra mechanism to work with // reshape-views or alike. std::vector> CollComm::map(const PVSlice &n_slc, const PVSlice &o_slc) { #if 0 auto nr = getTransceiver()->nranks(); std::vector counts_send(nr, 0); std::vector disp_send(nr, 0); std::vector counts_recv(nr, 0); std::vector disp_recv(nr, 0); // norm tile-size of orig array auto o_ntsz = o_slc.tile_size(0); // tilesize of my local partition of orig array auto o_tsz = o_slc.tile_size(); // linearized local slice of orig array auto o_llslc = Slice(o_ntsz * getTransceiver()->rank(), o_ntsz * getTransceiver()->rank() + o_tsz); // norm tile-size of new (reshaped) array auto n_ntsz = n_slc.tile_size(0); // tilesize of my local partition of new (reshaped) array auto n_tsz = n_slc.tile_size(); // linearized/flattened/1d local slice of new (reshaped) array auto n_llslc = Slice(n_ntsz * getTransceiver()->rank(), n_ntsz * getTransceiver()->rank() + n_tsz); for(auto r=0; r