diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 5a1c445889e..3c798866ba5 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -14,6 +14,54 @@ namespace vkcompute { namespace api { +PackedDimInfo::PackedDimInfo( + int32_t dim, + bool dim_padded, + int32_t outer_dim, + bool outer_dim_padded) + : packed_dim(dim), + packed_dim_padded(dim_padded), + outer_packed_dim(outer_dim), + outer_packed_dim_padded(outer_dim_padded), + is_block_packed(outer_dim != dim) { + if (!is_block_packed) { + VK_CHECK_COND(!outer_packed_dim_padded); + } +} + +PackedDimInfo calculate_packed_dim_info( + const utils::GPUMemoryLayout memory_layout, + const utils::StorageType storage_type) { + const int32_t packed_dim = utils::to_packed_dim(memory_layout); + + // Determine if packed dimension is padded + const bool packed_dim_padded = storage_type != utils::kBuffer || + memory_layout == utils::kPackedInt8_4W || + memory_layout == utils::kPackedInt8_4C || + memory_layout == utils::kPackedInt8_4H || + memory_layout == utils::kPackedInt8_4W4C || + memory_layout == utils::kPackedInt8_4H4W; + + // Determine outer packed dimension (for block-packed layouts) + int32_t outer_packed_dim; + if (memory_layout == utils::kPackedInt8_4W4C) { + outer_packed_dim = 0; // Width + } else if (memory_layout == utils::kPackedInt8_4H4W) { + outer_packed_dim = 1; // Height + } else { + outer_packed_dim = packed_dim; // No block packing + } + + // Determine if outer packed dimension is padded (only for block-packed + // layouts) + const bool outer_packed_dim_padded = + memory_layout == utils::kPackedInt8_4W4C || + memory_layout == utils::kPackedInt8_4H4W; + + return PackedDimInfo( + packed_dim, packed_dim_padded, outer_packed_dim, outer_packed_dim_padded); +} + /* * For PackedInt8 memory layouts, ensure that the scalar type used for the * tensor is kInt8x4. Otherwise, return the original scalar type. @@ -35,24 +83,28 @@ vkapi::ScalarType get_effective_scalar_type( */ std::vector calculate_sizes( const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout) { + const PackedDimInfo& packed_dim_info) { auto sizes = std::vector{ image.extents().width, image.extents().height, image.extents().depth}; - const auto packed_dim = utils::to_packed_dim(memory_layout); - sizes.at(packed_dim) *= 4; + sizes.at(packed_dim_info.packed_dim) *= 4; return sizes; } +/* + * Given a GPUMemoryLayout value, produce a dim order vector that matches the + * given memory layout. The produced dim order vector will be in the NCHW + * dimension order + */ std::vector calculate_dim_order( const size_t ndim, - const int32_t packed_dim) { + const PackedDimInfo& packed_dim_info) { // Special case for zero dim tensors if (ndim == 0) { return {0}; } std::vector dim_order(ndim); // Explicitly convert ndim to signed to prevent underflow - int64_t last_dim = int64_t(ndim) - 1 - packed_dim; + int64_t last_dim = int64_t(ndim) - 1 - packed_dim_info.packed_dim; int64_t cur_dim = 0; for (int d = 0; d < ndim; ++d) { @@ -69,24 +121,32 @@ std::vector calculate_dim_order( return dim_order; } +/* + * Given the sizes of a tensor and the dim order of the tensor (both in NCHW + * dimension order), calculate the strides of the tensor. 
+ */ std::vector calculate_strides( - const std::vector& sizes, + const size_t ndim, + const std::vector& padded_sizes, const std::vector& dim_order) { // For zero dim tensors - if (sizes.size() == 0) { + if (ndim == 0) { return {1}; } - size_t ndim = sizes.size(); std::vector strides(ndim); + // padded_sizes has align_up_4(ndim) dimensions, with padding at the start + // We need to offset when indexing into padded_sizes + const int64_t offset = padded_sizes.size() - ndim; + strides[dim_order[ndim - 1]] = 1; for (int32_t i = ndim - 2; i >= 0; --i) { - if (sizes[dim_order[i + 1]] == 0) { + if (padded_sizes[dim_order[i + 1] + offset] == 0) { strides[dim_order[i]] = strides[dim_order[i + 1]]; } else { strides[dim_order[i]] = - strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; + strides[dim_order[i + 1]] * padded_sizes[dim_order[i + 1] + offset]; } } @@ -177,9 +237,24 @@ utils::ivec4 flip_and_unsqueeze_ivec4( }; } +/* + * When stored on the GPU, tensor data may be stored using texels (i.e. a vector + * of 4 scalar values) in order to take advantage of the GPU's native + * vectorization capabilities. Furthermore, tensor metadata is passed in to + * shaders as ivec4 types. + * + * To accommodate these vectorized types, the sizes of a tensor will be modified + * for GPU storage in the following ways: + * + * 1. The dimensionality of the tensor will be padded to a multiple of 4. + * 2. The size of the packed dimension will be padded to a multiple of 4. + * + * The "packed dimension" is determined based on the utils::GPUMemoryLayout + * argument. + */ std::vector calculate_padded_sizes( const std::vector& sizes, - const int32_t packed_dim) { + const PackedDimInfo& packed_dim_info) { int64_t ndim = sizes.size(); if (ndim == 0) { ndim = 1; @@ -192,21 +267,51 @@ std::vector calculate_padded_sizes( padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); } - // Pad the packed dim to the next multiple of 4. - const int64_t dim_offset = packed_dim + 1; - const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); - padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + // Pad the packed dim to the next multiple of 4 if specified. + // This is required for texture storage and packed layouts. + if (packed_dim_info.packed_dim_padded) { + const int64_t dim_offset = packed_dim_info.packed_dim + 1; + const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); + padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + } + + // For block-packed layouts (e.g., 4W4C, 4H4W), also pad the outer packed + // dimension if it's different from the inner packed dimension and is marked + // as padded. + if (packed_dim_info.is_block_packed && + packed_dim_info.outer_packed_dim_padded) { + const int64_t outer_dim_offset = packed_dim_info.outer_packed_dim + 1; + const int64_t outer_padded_dim_size = + utils::val_at(-outer_dim_offset, sizes); + padded_sizes.at(ndim_up4 - outer_dim_offset) = + utils::align_up_4(outer_padded_dim_size); + } return padded_sizes; } +/* + * Calculate the image extents required of a texture backed tensor. 
+ */ utils::uvec3 calculate_image_extents( + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, const std::vector& padded_sizes, - const utils::GPUMemoryLayout memory_layout, - const std::vector& axis_map, - const int32_t packed_dim) { + const std::vector& axis_map) { utils::uvec3 extents({1, 1, 1}); + const int64_t packed_dim_axis = axis_map.at(packed_dim_info.packed_dim); + const int64_t outer_packed_dim_axis = + axis_map.at(packed_dim_info.outer_packed_dim); + + // If the packed dim is not padded to the next multiple of 4, then that means + // this tensor is using buffer storage and does not require texture extents. + const int64_t packed_dim_idx = + padded_sizes.size() - 1 - packed_dim_info.packed_dim; + if (padded_sizes.at(packed_dim_idx) % 4 != 0) { + return extents; + } + // For high dimensional tensors, buffer storage must be used. No need to // compute image extents in this case. if (padded_sizes.size() > 4) { @@ -222,25 +327,26 @@ utils::uvec3 calculate_image_extents( } // For "regular" tensor dtypes, 4 elements along the packed dim are packed - // into one texel (4-component vectorized type). However, for packed int8 - // memory layouts, an additional level of packing is employed where 4 int8 - // elements are packed into one int32, and then 4 int32 are packed into each - // ivec4 texel. - if (utils::is_packed_int8_layout(memory_layout)) { - // Each int in the ivec4 contains 4 channels. The overall ivec4 contains - // data for a 1Hx4Wx4C block of the input tensor. - if (memory_layout == utils::kPackedInt8_4W4C) { - VK_CHECK_COND(packed_dim == 2); - extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u); + // into one texel (4-component vectorized type). However, for kInt8x4 dtype, + // an additional level of packing is employed where 4 int8 elements are + // packed into one int32, and then 4 int32 are packed into each ivec4 texel. + if (dtype == vkapi::kInt8x4) { + // For layouts with only one packed dimension, loading an ivec4 texel from + // the texture loads 16 int8 values (4 int32 that each contain 4 int8). + if (!packed_dim_info.is_block_packed) { + extents[packed_dim_axis] = utils::div_up(extents[packed_dim_axis], 16u); } - // Each int in the ivec4 contains 4 elements along the width dim. The - // overall ivec4 contains data for a 4Hx4W block of the input tensor. - else if (memory_layout == utils::kPackedInt8_4H4W) { - VK_CHECK_COND(packed_dim == 0); - extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u); - } else { - VK_THROW("Unhandled packed int8 memory layout!"); + // Layouts with two packed dimension (e.g., 4W4C, 4H4W) load a 4x4 block of + // data from two dimensions with each ivec4 texel load, as opposed to 16 + // adjacent values from a single dimension. + else { + VK_CHECK_COND(extents[outer_packed_dim_axis] % 4 == 0); + extents[outer_packed_dim_axis] /= 4; + VK_CHECK_COND(extents[packed_dim_axis] % 4 == 0); + extents[packed_dim_axis] /= 4; } + } else { + extents[packed_dim_axis] /= 4; } // axis_map[3] indicates the WHCN index of the dimension used for batch @@ -251,9 +357,6 @@ utils::uvec3 calculate_image_extents( // Multiply the extents of the batch axis by the batch size. extents[batch_axis] *= padded_sizes.at(0); - VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); - extents[axis_map.at(packed_dim)] /= 4; - return extents; } @@ -285,73 +388,42 @@ utils::uvec3 calculate_logical_limits( * directly from tensor sizes. 
*/ utils::uvec3 calculate_logical_limits( - const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const std::vector& axis_map, - const int32_t packed_dim) { + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, + const std::vector& axis_map) { return calculate_logical_limits( - calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - memory_layout, - axis_map, - packed_dim), + calculate_image_extents(dtype, packed_dim_info, padded_sizes, axis_map), axis_map); } +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. + */ int64_t calculate_gpu_buffer_numel( - const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const vkapi::ScalarType dtype) { + const vkapi::ScalarType dtype, + const std::vector& padded_sizes) { size_t numel; - // Mirrors the logic in calculate_image_extents for packed int8 memory layouts + numel = utils::multiply_integers(padded_sizes); + + // For this dtype, the data buffer is interpreted as an array of int32, where + // each int32 contains 4xint8 values. To account for this, the number of + // elements needs to be divided by 4. if (dtype == vkapi::kInt8x4) { - VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout)); - std::vector blocks_in_dim = - flip_and_unsqueeze(sizes, kTensorSizes, 0); - // Each ivec4 contains data for a 1Hx4Wx4C block of the input - if (memory_layout == utils::kPackedInt8_4W4C) { - blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); - blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]); - } - // Each ivec4 contains data for a 4Hx4W block of the input - else if (memory_layout == utils::kPackedInt8_4H4W) { - blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); - blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]); - } else { - VK_THROW("Unhandled packed int8 memory layout!"); - } - // Each block is represented as an ivec4, and the base dtype of the buffer - // is int. Therefore, need to multiply the number of blocks by 4 to obtain - // the number of int elements in the data buffer. - numel = utils::multiply_integers(blocks_in_dim) * 4; + // Should already be a multiple of 4 due to padding the packed dimensions + VK_CHECK_COND(numel % 4 == 0); + numel /= 4; } - // Case for "regular" dtypes/memory layouts - else { - numel = utils::multiply_integers(sizes); - - // For 8-bit types, align to the next multiple of 4. For devices that do not - // support 8-bit storage buffers, the tensor data will be interpreted as an - // array of int32 instead. - if (vkapi::element_size(dtype) == 1) { - numel = utils::align_up_4(numel); - } - } - return numel; -} -int64_t calculate_staging_or_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const vkapi::ScalarType dtype) { - // For texture backed tensors, simply multiply the total number of texels by 4 - if (storage_type != utils::kBuffer) { - return image_extents[0] * image_extents[1] * image_extents[2] * 4; + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. 
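+  // For example, a kChar tensor whose padded sizes multiply to 35 reports a
+  // buffer numel of 36 here, so the 36-byte buffer can equivalently be viewed
+  // as 9 int32 values on devices without 8-bit storage buffer support.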
+ if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); } - return calculate_gpu_buffer_numel(sizes, memory_layout, dtype); + return numel; } template ::value>> @@ -365,13 +437,13 @@ int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { int32_t create_hashed_layout( const std::vector& dim_order, const std::vector& axis_map, - const int32_t packed_dim, + const PackedDimInfo& packed_dim_info, const utils::StorageType storage_type) { if (storage_type == utils::kBuffer) { return pack_into_int32( flip_and_unsqueeze(dim_order, kTensorDimOrder, 0), 0); } - return pack_into_int32(axis_map, packed_dim); + return pack_into_int32(axis_map, packed_dim_info.packed_dim); } size_t calculate_max_ubo_nbytes( @@ -498,26 +570,20 @@ vkapi::VulkanBuffer allocate_buffer( vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, + const int64_t physical_numel, const bool allocate_memory) : context_(context), storage_type_{storage_type}, image_extents_(calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - memory_layout, - axis_map, - packed_dim)), - buffer_length_{calculate_staging_or_gpu_buffer_numel( - context_, - sizes, - image_extents_, - storage_type, - memory_layout, - dtype)}, + dtype, + packed_dim_info, + padded_sizes, + axis_map)), + buffer_length_{physical_numel}, buffer_offset_{0}, image_(allocate_image( context_, @@ -634,18 +700,20 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory, const utils::AxisMapLayout axis_map_layout) - : dtype_(get_effective_scalar_type(dtype, memory_layout)), + : packed_dim_info_(calculate_packed_dim_info(memory_layout, storage_type)), + dtype_(get_effective_scalar_type(dtype, memory_layout)), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), - packed_dim_(utils::to_packed_dim(memory_layout)), - dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), + dim_order_(calculate_dim_order(sizes_.size(), packed_dim_info_)), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), - strides_(calculate_strides(sizes, dim_order_)), + strides_(calculate_strides(sizes.size(), padded_sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), + physical_numel_(calculate_gpu_buffer_numel(dtype_, padded_sizes_)), hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, storage_type)), // Related to tensor metadata UBOs min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, @@ -657,11 +725,11 @@ vTensor::vTensor( storage_(std::make_shared( context, storage_type, - memory_layout, axis_map_, - packed_dim_, - sizes, + packed_dim_info_, + padded_sizes_, dtype_, + physical_numel_, allocate_memory)) { // uniform_data_ only valid for low dim tensors if (sizes.size() <= 4) { @@ -683,18 +751,21 @@ vTensor::vTensor( const vkapi::VulkanImage& image, const utils::GPUMemoryLayout memory_layout, const utils::AxisMapLayout axis_map_layout) - : dtype_(vkapi::element_scalartype(image.format())), + : packed_dim_info_( + calculate_packed_dim_info(memory_layout, utils::kTexture3D)), + dtype_(vkapi::element_scalartype(image.format())), // Calculate tensor metadata - sizes_(calculate_sizes(image, 
memory_layout)), - packed_dim_(utils::to_packed_dim(memory_layout)), + sizes_(calculate_sizes(image, packed_dim_info_)), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), numel_(utils::multiply_integers(sizes_)), + physical_numel_(calculate_gpu_buffer_numel(dtype_, padded_sizes_)), hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, utils::kTexture3D)), // Related to tensor metadata UBOs min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, @@ -713,14 +784,16 @@ vTensor::vTensor( } vTensor::vTensor(vTensor& other) - : dtype_(other.dtype_), + : packed_dim_info_{other.packed_dim_info_}, + dtype_(other.dtype_), // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), - packed_dim_{other.packed_dim_}, + padded_sizes_(other.padded_sizes_.begin(), other.padded_sizes_.end()), dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), numel_(other.numel_), + physical_numel_(other.physical_numel_), hashed_layout_(other.hashed_layout_), min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, max_ubo_nbytes_{other.max_ubo_nbytes_}, @@ -735,18 +808,20 @@ vTensor::vTensor( vTensor& other, const std::vector& sizes, const std::vector& dim_order) - : dtype_(other.dtype_), + : packed_dim_info_(other.packed_dim_info_), + dtype_(other.dtype_), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), - packed_dim_(other.packed_dim_), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), - strides_(calculate_strides(sizes_, dim_order_)), - numel_(other.numel_), + strides_(calculate_strides(sizes_.size(), padded_sizes_, dim_order_)), + numel_(utils::multiply_integers(sizes_)), + physical_numel_(calculate_gpu_buffer_numel(dtype_, padded_sizes_)), hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, other.storage_type())), min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, max_ubo_nbytes_{other.max_ubo_nbytes_}, @@ -755,11 +830,7 @@ vTensor::vTensor( // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(UniformData{ - static_cast(utils::multiply_integers(sizes_)), - sizes_, - dim_order_, - strides_, - other.logical_limits()}); + numel_, sizes_, dim_order_, strides_, other.logical_limits()}); VK_CHECK_COND( dim_order_is_valid(dim_order_), "new dim order provided is invalid"); @@ -840,15 +911,15 @@ vTensor::TextureMetadata::TextureMetadata( const std::vector& src_sizes, const TextureLimits& src_logical_limits, const std::vector& src_axis_map, - const int32_t src_packed_dim) { - update(src_sizes, src_logical_limits, src_axis_map, src_packed_dim); + const PackedDimInfo& src_packed_dim_info) { + update(src_sizes, src_logical_limits, src_axis_map, src_packed_dim_info); } void vTensor::TextureMetadata::update( const std::vector& src_sizes, const TextureLimits& src_logical_limits, const std::vector& src_axis_map, - const int32_t src_packed_dim) { + const PackedDimInfo& src_packed_dim_info) { // Convert sizes to flipped and unsqueezed format (fixed to 4 dimensions for // texture) std::vector fu_sizes = @@ -877,7 +948,7 @@ void vTensor::TextureMetadata::update( axis_map[i] = 0; } - packed_dim = src_packed_dim; + packed_dim = 
src_packed_dim_info.packed_dim; } vkapi::VulkanImage& vTensor::image( @@ -911,17 +982,36 @@ vkapi::VulkanBuffer& vTensor::buffer( } utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + // Check for block-packed layouts (two-level packing) - only applicable for + // kInt8x4 + if (dtype_ == vkapi::kInt8x4 && packed_dim_info_.is_block_packed) { + // For 4W4C: packed_dim = Channels, outer_packed_dim = Width + if (packed_dim_info_.packed_dim == WHCN::kChannelsDim && + packed_dim_info_.outer_packed_dim == WHCN::kWidthDim) { + return utils::kPackedInt8_4W4C; + } + // For 4H4W: packed_dim = Width, outer_packed_dim = Height + if (packed_dim_info_.packed_dim == WHCN::kWidthDim && + packed_dim_info_.outer_packed_dim == WHCN::kHeightDim) { + return utils::kPackedInt8_4H4W; + } + VK_THROW("Invalid block-packed layout configuration for kInt8x4 dtype"); + } + + // Single-level packing layouts if (dtype_ == vkapi::kInt8x4) { - switch (packed_dim_) { + switch (packed_dim_info_.packed_dim) { case WHCN::kChannelsDim: - return utils::kPackedInt8_4W4C; + return utils::kPackedInt8_4C; case WHCN::kWidthDim: - return utils::kPackedInt8_4H4W; + return utils::kPackedInt8_4W; + case WHCN::kHeightDim: + return utils::kPackedInt8_4H; default: VK_THROW("Invalid packed dim for Tensor with kInt8x4 type"); } } - switch (packed_dim_) { + switch (packed_dim_info_.packed_dim) { case WHCN::kWidthDim: return utils::kWidthPacked; case WHCN::kHeightDim: @@ -996,7 +1086,7 @@ const vkapi::BufferBindInfo vTensor::texture_meta_ubo() { size_t ubo_nbytes = sizeof(TextureMetadata); if (!texture_meta_.buffer()) { TextureLimits limits(logical_limits()); - TextureMetadata data(sizes_, limits, axis_map_, packed_dim_); + TextureMetadata data(sizes_, limits, axis_map_, packed_dim_info_); texture_meta_ = ParamsBuffer(storage_->context_, data); } return vkapi::BufferBindInfo(texture_meta_.buffer(), 0, ubo_nbytes); @@ -1049,7 +1139,8 @@ void vTensor::acquire_allocation(vkapi::Allocation&& allocation) { void vTensor::update_metadata() { numel_ = utils::multiply_integers(sizes_); - strides_ = calculate_strides(sizes_, dim_order_); + physical_numel_ = calculate_gpu_buffer_numel(dtype_, padded_sizes_); + strides_ = calculate_strides(sizes_.size(), padded_sizes_, dim_order_); // Update uniform data if it has been modified if (sizes_.size() <= 4) { @@ -1061,7 +1152,7 @@ void vTensor::update_metadata() { uniform_data_->strides_v = flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); uniform_data_->logical_limits.limits = calculate_logical_limits( - sizes_, estimate_memory_layout(), axis_map_, packed_dim_); + dtype_, packed_dim_info_, padded_sizes_, axis_map_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); @@ -1088,21 +1179,17 @@ void vTensor::update_metadata() { if (texture_meta_.buffer()) { TextureMetadata data( - sizes_, uniform_data_->logical_limits, axis_map_, packed_dim_); + sizes_, uniform_data_->logical_limits, axis_map_, packed_dim_info_); texture_meta_.update(data); } } void vTensor::check_sizes(const std::vector& sizes) const { - utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout(); if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), - est_memory_layout, - axis_map_, - packed_dim_); + dtype_, packed_dim_info_, padded_sizes_, axis_map_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -1116,10 +1203,10 @@ void vTensor::check_sizes(const std::vector& sizes) const { } else { // For buffer storage check that the current buffer is large enough for // the new sizes of the tensor. - int64_t numel = - calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_); + int64_t gpu_buffer_numel = + calculate_gpu_buffer_numel(dtype_, padded_sizes_); bool valid_resize = - numel + storage_->buffer_offset_ <= storage_->buffer_length_; + gpu_buffer_numel + storage_->buffer_offset_ <= storage_->buffer_length_; VK_CHECK_COND( valid_resize, "tensor sizes requires a larger buffer than the current one."); @@ -1137,11 +1224,12 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); dim_order_ = new_dim_order; // Update the hashed layout because dim order is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + hashed_layout_ = create_hashed_layout( + dim_order_, axis_map_, packed_dim_info_, storage_type()); update_metadata(); } @@ -1149,9 +1237,10 @@ void vTensor::virtual_reconfigure( void vTensor::virtual_clone(const vTensor& other) { VK_CHECK_COND(is_view_of(other)); sizes_ = other.sizes_; + padded_sizes_ = other.padded_sizes_; dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; - packed_dim_ = other.packed_dim_; + packed_dim_info_ = other.packed_dim_info_; hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); @@ -1164,6 +1253,7 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { check_sizes(new_sizes); sizes_ = new_sizes; + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); update_metadata(); } @@ -1187,14 +1277,34 @@ void transpose_dim_order_inplace( } void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { - std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); - const int dim0_whcn = sizes_.size() - 1 - dim0; const int dim1_whcn = sizes_.size() - 1 - dim1; - if (packed_dim_ == dim0_whcn) { - packed_dim_ = dim1_whcn; - } else if (packed_dim_ == dim1_whcn) { - packed_dim_ = dim0_whcn; + + // For block-packed layouts, do not allow transposition if either packed_dim + // or outer_packed_dim is one of the dims being transposed + if (packed_dim_info_.is_block_packed) { + VK_CHECK_COND( + packed_dim_info_.packed_dim != dim0_whcn && + packed_dim_info_.packed_dim != dim1_whcn); + VK_CHECK_COND( + packed_dim_info_.outer_packed_dim != dim0_whcn && + packed_dim_info_.outer_packed_dim != dim1_whcn); + } + + std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); + + // Update packed_dim and outer_packed_dim if they match one of the transposed + // dims + if (packed_dim_info_.packed_dim == dim0_whcn) { + packed_dim_info_.packed_dim = dim1_whcn; + } else if (packed_dim_info_.packed_dim == dim1_whcn) { + packed_dim_info_.packed_dim = dim0_whcn; + } + + if (packed_dim_info_.outer_packed_dim == dim0_whcn) { + packed_dim_info_.outer_packed_dim = dim1_whcn; + } else if (packed_dim_info_.outer_packed_dim == dim1_whcn) { + packed_dim_info_.outer_packed_dim = dim0_whcn; } if (storage_type() == utils::kBuffer) { @@ -1212,9 +1322,13 @@ void vTensor::virtual_transpose(const int64_t 
dim0, const int64_t dim1) { } } - // Update the hashed layout because dim order / axis mpa is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + // Update the hashed layout because dim order / axis map is updated + hashed_layout_ = create_hashed_layout( + dim_order_, axis_map_, packed_dim_info_, storage_type()); + + // Recalculate padded_sizes_ based on the new sizes and updated + // packed_dim_info + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 967148b8dbe..b3c7184d2b3 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -22,48 +22,40 @@ namespace api { static constexpr size_t kTensorDimLimit = 8; /* - * Given a GPUMemoryLayout value, produce a dim order vector that matches the - * given memory layout. The produced dim order vector will be in the NCHW - * dimension order + * PackedDimInfo encapsulates metadata about packed dimensions in GPU tensors. + * This includes information about which dimension is packed, whether it's + * padded, and block packing information for special layouts like 4W4C and 4H4W. */ -std::vector calculate_dim_order( - const size_t ndim, - const int32_t packed_dim); - -/* - * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) - * dimension order, calculate the strides of the tensor. - */ -std::vector calculate_strides( - const std::vector& sizes, - const std::vector& dim_order); - -/* - * When stored on the GPU, tensor data is stored using texels (i.e. a vector of - * 4 scalar values) in order to take advantage of the GPU's native vectorization - * capabilities. Furthermore, tensor metadata is passed in to shaders as ivec4 - * types. - * - * To accommodate these vectorized types, the sizes of a tensor will be modified - * for GPU storage in the following ways: - * - * 1. The dimensionality of the tensor will be padded to a multiple of 4. - * 2. The size of the packed dimension will be padded to a multiple of 4. - * - * The "packed dimension" is determined based on the utils::GPUMemoryLayout - * argument. - */ -std::vector calculate_padded_sizes( - const std::vector& sizes, - const int32_t packed_dim); - -/* - * Calculate the image extents required of a texture backed tensor. - */ -utils::uvec3 calculate_image_extents( - const std::vector& padded_sizes, - const std::vector& axis_map, - const int32_t packed_dim); +struct PackedDimInfo { + // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for + // width, 1 for height, etc.). For texture backed tensors, this describes + // which dimension is packed along a texel. For buffer backed tensors, this + // describes which dimension has a stride of 1 (i.e. is last in the dim + // order). + int32_t packed_dim; + // Describes if the packed dimension is padded to a multiple of 4. This will + // be true for all tensors that use texture storage, and will also be true + // for the PACKED_PADDED memory layouts. + bool packed_dim_padded; + // Describes a second level of packing, if applicable (which will only apply + // to the 4W4C and 4H4W layouts). If there is no second level of packing, + // then this will be equal to packed_dim. Otherwise, it will represent the + // outer dim used to construct block packing. For example, 4W4C will have + // packed_dim = 2 and outer_packed_dim = 0. 
+ int32_t outer_packed_dim; + // Whether the outer packed dim is padded to the next multiple of 4. This is + // true only for block-packed layouts. + bool outer_packed_dim_padded; + // True if this layout uses block packing (i.e., outer_packed_dim != + // packed_dim). Block packing is used for layouts like 4W4C and 4H4W. + bool is_block_packed; + + PackedDimInfo( + int32_t dim, + bool dim_padded, + int32_t outer_dim, + bool outer_dim_padded); +}; struct LastAccess { vkapi::PipelineStageFlags stage; @@ -79,18 +71,6 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; -/* - * Calculate the number of elements that a GPU buffer would require to store the - * contents of a tensor. This will depend on the storage type and dtype of the - * tensor, as well as the features available on the device. - */ -int64_t calculate_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype); - class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -99,11 +79,11 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, + const int64_t physical_numel, const bool allocate_memory = true); vTensorStorage(Context* const context, const vkapi::VulkanImage& image); @@ -295,13 +275,13 @@ class vTensor final { const std::vector& sizes, const TextureLimits& logical_limits, const std::vector& axis_map, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); void update( const std::vector& sizes, const TextureLimits& logical_limits, const std::vector& axis_map, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); }; private: @@ -310,16 +290,14 @@ class vTensor final { * to construct a tensor. */ + // Information about packed dimension padding and block packing + PackedDimInfo packed_dim_info_; // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; // sizes of the tensor in NCHW dimension order std::vector sizes_; - // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for - // width, 1 for height, etc.). For texture backed tensors, this describes - // which dimension is packed along a texel. For buffer backed tensors, this - // describes which dimension has a stride of 1 (i.e. is last in the dim - // order). - int32_t packed_dim_; + // padded sizes of the tensor (pre-computed to avoid recalculation) + std::vector padded_sizes_; /* * "Layout" metadata. These describe with further detail how tensor data is @@ -353,6 +331,10 @@ class vTensor final { // number of elements based on the canonical sizes size_t numel_; + // number of elements required for GPU buffer storage (with padding/packing) + // This is pre-computed to avoid recomputing calculate_gpu_buffer_numel + int64_t physical_numel_; + // For texture backed tensors, this int32 contains the axis map data packed // into a single int32. For buffer backed tensors, this int32 contains the // wchn dim order data packed into a single int32. 
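A minimal usage sketch of how these new members relate for a block-packed layout; the construction mirrors the test helpers later in this diff, and the values match the golden test added below:

// Sketch only: a 4H4W block-packed int8 tensor of sizes {9, 13, 17}.
vTensor t(
    context(),
    {9, 13, 17},
    vkapi::kChar, // stored as kInt8x4
    utils::kBuffer,
    utils::kPackedInt8_4H4W,
    /*allocate_memory = */ false);
const auto& info = t.packed_dim_info();
// info.packed_dim == 0 (width), info.outer_packed_dim == 1 (height),
// info.is_block_packed == true, and both packed dims are padded, so
// t.padded_sizes() == {1, 9, 16, 20} and
// t.physical_numel() == (9 * 16 * 20) / 4 == 720 kInt8x4 elements.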
@@ -483,7 +465,11 @@ class vTensor final { utils::GPUMemoryLayout estimate_memory_layout() const; inline int32_t packed_dim() const { - return packed_dim_; + return packed_dim_info_.packed_dim; + } + + inline const PackedDimInfo& packed_dim_info() const { + return packed_dim_info_; } /* @@ -514,10 +500,22 @@ class vTensor final { return strides_; } + inline const std::vector& padded_sizes() const { + return padded_sizes_; + } + inline size_t numel() const { return numel_; } + inline int64_t physical_numel() const { + return physical_numel_; + } + + inline utils::uvec3 image_extents() const { + return storage_->image_extents_; + } + inline size_t nbytes() const { return element_size(dtype()) * numel(); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 18e97d7b516..5b0e66030c8 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -362,6 +362,10 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().staging_buffer_numel(); } + inline int64_t physical_numel_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().physical_numel(); + } + inline utils::StorageType storage_type_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().storage_type(); } @@ -442,6 +446,11 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().packed_dim(); } + inline const api::PackedDimInfo& packed_dim_info_of( + const ValueRef idx) const { + return values_.at(idx).toConstTensor().packed_dim_info(); + } + inline int32_t concat_dim_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().concat_dim(); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp index 0a4acb6cef3..1923757afbd 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp @@ -108,17 +108,19 @@ utils::uvec3 concat_pick_global_wg_size( // Calculate what the image extents would be of a tensor with the input // volume's sizes. This produces the number of texels that would need to be // written to. - const int32_t packed_dim = graph->packed_dim_of(out); + + const int32_t packed_dim_idx = graph->packed_dim_of(out); std::vector inp_volume_texel_sizes = - api::calculate_padded_sizes(inp_volume_sizes, packed_dim); + api::flip_and_unsqueeze(inp_volume_sizes, api::kTensorSizes, 1); + // If the concat_dim is the same as the packed dim, and the concat_offset for // this input batch is not a multiple of 4, then the data from an input texel // may be split up between two output texels. For example: // I0 , I1 , I2 , I2 // O0 , O1 , O2 , X | X , X , X , X // Therefore, 1 texel is added to the packed dim to account for this. 
- inp_volume_texel_sizes.at(3 - packed_dim) = - utils::div_up_4(inp_volume_texel_sizes.at(3 - packed_dim)) + 1; + inp_volume_texel_sizes.at(packed_dim_idx) = + utils::div_up_4(inp_volume_texel_sizes.at(packed_dim_idx)) + 1; const uint32_t inp_volume_texel_numel = utils::multiply_integers(inp_volume_texel_sizes); @@ -324,7 +326,7 @@ void add_concat_node( {1u, 1u, 1u}, {1u, 1u, 1u}, // Inputs and Outputs - {{concat_offset, vkapi::kWrite}}, + {{concat_offset, vkapi::kReadWrite}}, // Parameter buffers param_buffers, // Push Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp index 687b3923354..223f082d6a6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp @@ -34,7 +34,11 @@ void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) { add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); vkapi::ParamsBindList ubos({}); - ubos.append({graph.logical_limits_ubo(out)}); + if (graph.storage_type_of(out) == utils::kBuffer) { + ubos.append({graph.numel_ubo(out)}); + } else { + ubos.append({graph.logical_limits_ubo(out)}); + } graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, diff --git a/backends/vulkan/runtime/utils/StorageUtils.cpp b/backends/vulkan/runtime/utils/StorageUtils.cpp index cfe3d9e159a..767c1294c39 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.cpp +++ b/backends/vulkan/runtime/utils/StorageUtils.cpp @@ -13,6 +13,9 @@ namespace utils { bool is_packed_int8_layout(const GPUMemoryLayout layout) { switch (layout) { + case kPackedInt8_4W: + case kPackedInt8_4C: + case kPackedInt8_4H: case kPackedInt8_4W4C: case kPackedInt8_4H4W: return true; diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index a269adccecb..45b1529f5b0 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -101,6 +101,14 @@ enum class GPUMemoryLayout : uint8_t { * 16 element block is loaded, rather than 4 elements along one dimension. 
*/ + // "vector" packed layouts - single level of packing (4 elements along packed + // dim per int32) + TENSOR_PACKED_INT8_4W = 5u, + TENSOR_PACKED_INT8_4C = 6u, + TENSOR_PACKED_INT8_4H = 7u, + + // Block packed layouts - two levels of packing (4x4 block composed of + // elements from two packed dims per ivec4) TENSOR_PACKED_INT8_4W4C = 3u, TENSOR_PACKED_INT8_4H4W = 4u, }; @@ -114,6 +122,15 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; +static constexpr GPUMemoryLayout kPackedInt8_4W = + GPUMemoryLayout::TENSOR_PACKED_INT8_4W; + +static constexpr GPUMemoryLayout kPackedInt8_4C = + GPUMemoryLayout::TENSOR_PACKED_INT8_4C; + +static constexpr GPUMemoryLayout kPackedInt8_4H = + GPUMemoryLayout::TENSOR_PACKED_INT8_4H; + static constexpr GPUMemoryLayout kPackedInt8_4W4C = GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C; @@ -129,6 +146,12 @@ T to_packed_dim(const GPUMemoryLayout layout) { return 1; case kChannelsPacked: return 2; + case kPackedInt8_4W: + return 0; + case kPackedInt8_4C: + return 2; + case kPackedInt8_4H: + return 1; case kPackedInt8_4W4C: return 2; case kPackedInt8_4H4W: @@ -170,6 +193,15 @@ inline std::ostream& operator<<( case kChannelsPacked: os << "TENSOR_CHANNELS_PACKED"; break; + case kPackedInt8_4W: + os << "TENSOR_PACKED_INT8_4W"; + break; + case kPackedInt8_4C: + os << "TENSOR_PACKED_INT8_4C"; + break; + case kPackedInt8_4H: + os << "TENSOR_PACKED_INT8_4H"; + break; case kPackedInt8_4W4C: os << "TENSOR_PACKED_INT8_4W4C"; break; diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 03619ec54af..024c3a086a8 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -112,6 +112,70 @@ TEST_F(VulkanComputeAPITest, print_shader_executable_properties) { #endif // VK_KHR_pipeline_executable_properties && ETVK_INSPECT_PIPELINES +std::vector get_reference_dim_order( + const size_t ndim, + const int32_t packed_dim) { + // Special case for zero dim tensors + if (ndim == 0) { + return {0}; + } + std::vector dim_order(ndim); + // Explicitly convert ndim to signed to prevent underflow + int64_t last_dim = int64_t(ndim) - 1 - packed_dim; + + int64_t cur_dim = 0; + for (int d = 0; d < ndim; ++d) { + if (d == last_dim) { + cur_dim++; + } + dim_order[d] = cur_dim; + cur_dim++; + } + if (last_dim >= 0) { + dim_order[ndim - 1] = last_dim; + } + + return dim_order; +} + +std::vector get_reference_padded_sizes( + const std::vector& sizes, + const int32_t packed_dim, + const bool packed_dim_padded, + const int32_t outer_packed_dim = -1, + const bool outer_packed_dim_padded = false) { + int64_t ndim = sizes.size(); + if (ndim == 0) { + ndim = 1; + } + + // Tensor sizes will be unsqueezed up to the next multiple of 4 + const int64_t ndim_up4 = utils::align_up_4(ndim); + std::vector padded_sizes(ndim_up4); + for (int64_t i = 0; i < ndim_up4; ++i) { + padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); + } + + // Pad the packed dim to the next multiple of 4 if specified + if (packed_dim_padded) { + const int64_t dim_offset = packed_dim + 1; + const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); + padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + } + + // For block-packed layouts, also pad the outer packed dimension if specified + if (outer_packed_dim >= 0 && outer_packed_dim != packed_dim && + outer_packed_dim_padded) { + const int64_t 
outer_dim_offset = outer_packed_dim + 1; + const int64_t outer_padded_dim_size = + utils::val_at(-outer_dim_offset, sizes); + padded_sizes.at(ndim_up4 - outer_dim_offset) = + utils::align_up_4(outer_padded_dim_size); + } + + return padded_sizes; +} + std::vector get_reference_strides( const std::vector& sizes, const utils::GPUMemoryLayout layout, @@ -194,39 +258,100 @@ std::vector get_reference_strides( return {}; } -/* - * Applies the following transformations to a tensor's dim_order vector: - * 1. Reverse the order of elements so that the fastest moving dimensions are - * first. - * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the - * width dimension, 1 represents the height dimension, and 2 represents the - * channels dimension. - * 3. Unsqueeze the dim_order vector to the next multiple of 4. - */ -std::vector create_whcn_dim_order( - const std::vector& dim_order) { - size_t ndim = dim_order.size(); - std::vector whcn_order(ndim); +int64_t get_reference_physical_numel( + const vkapi::ScalarType dtype, + const std::vector& padded_sizes) { + size_t numel = utils::multiply_integers(padded_sizes); - // Convert from NCHW to WHCN index, and flip the dim order so that the fastest - // moving dimension is first. - // example: { 1, 2, 0} -> { 2, 0, 1} - // {height, width, channels} -> {channels, width, height} - for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim; - ++whcn_i, --nchw_i) { - whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i); + // For kInt8x4, the data buffer is interpreted as an array of int32, where + // each int32 contains 4xint8 values. To account for this, the number of + // elements needs to be divided by 4. + if (dtype == vkapi::kInt8x4) { + // Should already be a multiple of 4 due to padding + if (numel % 4 != 0) { + VK_THROW("Expected numel to be multiple of 4 for kInt8x4"); + } + numel /= 4; } - // Unsqueeze to the next multiple of 4 - size_t ndim_up4 = utils::align_up_4(ndim); - whcn_order.resize(ndim_up4); + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. + if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); + } + return numel; +} + +utils::uvec3 get_reference_image_extents( + const vkapi::ScalarType dtype, + const int32_t packed_dim, + const int32_t outer_packed_dim, + const bool is_block_packed, + const std::vector& padded_sizes, + const std::vector& axis_map) { + utils::uvec3 extents({1, 1, 1}); - // Append unsqueezed dimensions - for (size_t i = ndim; i < ndim_up4; ++i) { - whcn_order.at(i) = i; + const int64_t packed_dim_axis = axis_map.at(packed_dim); + const int64_t outer_packed_dim_axis = axis_map.at(outer_packed_dim); + + // If the packed dim is not padded to the next multiple of 4, then that means + // this tensor is using buffer storage and does not require texture extents. + const int64_t packed_dim_idx = padded_sizes.size() - 1 - packed_dim; + if (padded_sizes.at(packed_dim_idx) % 4 != 0) { + return extents; + } + + // For high dimensional tensors, buffer storage must be used. No need to + // compute image extents in this case. + if (padded_sizes.size() > 4) { + return extents; + } + + // First three elements of axis_map indicate which (X,Y,Z) image axis the + // width, height, and channels dim of the tensor maps to. 
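+  // With the default axis map {0, 1, 2, 2}, for example, the width dim maps
+  // to the X axis of the image, height to Y, and channels to Z.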
+ for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { + const int64_t axis = axis_map.at(whcn_dim); + const int64_t dim = padded_sizes.size() - 1 - whcn_dim; + extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); + } + + // For "regular" tensor dtypes, 4 elements along the packed dim are packed + // into one texel (4-component vectorized type). However, for kInt8x4 dtype, + // an additional level of packing is employed where 4 int8 elements are + // packed into one int32, and then 4 int32 are packed into each ivec4 texel. + if (dtype == vkapi::kInt8x4) { + // For layouts with only one packed dimension, loading an ivec4 texel from + // the texture loads 16 int8 values (4 int32 that each contain 4 int8). + if (!is_block_packed) { + extents[packed_dim_axis] = utils::div_up(extents[packed_dim_axis], 16u); + } + // Layouts with two packed dimension (e.g., 4W4C, 4H4W) load a 4x4 block of + // data from two dimensions with each ivec4 texel load, as opposed to 16 + // adjacent values from a single dimension. + else { + if (extents[outer_packed_dim_axis] % 4 != 0) { + VK_THROW("Expected outer_packed_dim_axis extent to be multiple of 4"); + } + extents[outer_packed_dim_axis] /= 4; + if (extents[packed_dim_axis] % 4 != 0) { + VK_THROW("Expected packed_dim_axis extent to be multiple of 4"); + } + extents[packed_dim_axis] /= 4; + } + } else { + extents[packed_dim_axis] /= 4; } - return whcn_order; + // axis_map[3] indicates the WHCN index of the dimension used for batch + // concatenation. Thus a double lookup is required to determine the image axis + // used for batch concatenation. + const int64_t concatted_whcn_dim = axis_map.at(3); + const int64_t batch_axis = axis_map.at(concatted_whcn_dim); + // Multiply the extents of the batch axis by the batch size. 
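+  // With the default axis map {0, 1, 2, 2}, axis_map[3] == 2, so batches are
+  // concatenated along the same image axis (Z) that the channels dim maps to.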
+ extents[batch_axis] *= padded_sizes.at(0); + + return extents; } TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { @@ -250,89 +375,405 @@ bool compare_vectors( return true; } -TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { - // ndim, GPUMemoryLayout, expected dim order pairs - std::vector>> test_cases = { - {1, WHCN::kWidthDim, {0}}, - {1, WHCN::kHeightDim, {0}}, - {1, WHCN::kChannelsDim, {0}}, - {2, WHCN::kWidthDim, {0, 1}}, - {2, WHCN::kHeightDim, {1, 0}}, - {2, WHCN::kChannelsDim, {0, 1}}, - {3, WHCN::kWidthDim, {0, 1, 2}}, - {3, WHCN::kHeightDim, {0, 2, 1}}, - {3, WHCN::kChannelsDim, {1, 2, 0}}, - {4, WHCN::kWidthDim, {0, 1, 2, 3}}, - {4, WHCN::kHeightDim, {0, 1, 3, 2}}, - {4, WHCN::kChannelsDim, {0, 2, 3, 1}}, - }; +TEST_F(VulkanComputeAPITest, tensor_layout_metadata_test) { + // Test all combinations of tensor sizes, storage types, and memory layouts + // to ensure that layout metadata is computed correctly - for (const auto& test_case : test_cases) { - const size_t& ndim = std::get<0>(test_case); - const int32_t packed_dim = std::get<1>(test_case); - const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, packed_dim); + // Define test configuration for each layout type + struct LayoutTestConfig { + utils::GPUMemoryLayout layout; + vkapi::ScalarType dtype; + int32_t packed_dim; + int32_t outer_packed_dim; + bool is_block_packed; + }; - ASSERT_TRUE(dim_order == expected_dim_order); - } -} + std::vector layout_configs = { + // Standard layouts with float dtype + {utils::kWidthPacked, + vkapi::kFloat, + WHCN::kWidthDim, + WHCN::kWidthDim, + false}, + {utils::kHeightPacked, + vkapi::kFloat, + WHCN::kHeightDim, + WHCN::kHeightDim, + false}, + {utils::kChannelsPacked, + vkapi::kFloat, + WHCN::kChannelsDim, + WHCN::kChannelsDim, + false}, + + // Packed int8 vector layouts (single-dimension packed) + // Use kChar, which should be converted to kInt8x4 + {utils::kPackedInt8_4W, + vkapi::kChar, + WHCN::kWidthDim, + WHCN::kWidthDim, + false}, + {utils::kPackedInt8_4C, + vkapi::kChar, + WHCN::kChannelsDim, + WHCN::kChannelsDim, + false}, + {utils::kPackedInt8_4H, + vkapi::kChar, + WHCN::kHeightDim, + WHCN::kHeightDim, + false}, + + // Packed int8 block layouts (two-dimension packed) + // Use kChar, which should be converted to kInt8x4 + {utils::kPackedInt8_4W4C, + vkapi::kChar, + WHCN::kChannelsDim, + WHCN::kWidthDim, + true}, + {utils::kPackedInt8_4H4W, + vkapi::kChar, + WHCN::kWidthDim, + WHCN::kHeightDim, + true}, + }; -TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { - vTensor v_tensor_to_resize( - context(), - {25, 25, 25, 25}, - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked, - /*allocate_memory = */ false); + std::vector storage_types = { + utils::kBuffer, utils::kTexture3D}; for (const auto& sizes : standard_sizes_to_test) { - if (sizes.size() < 3) { - continue; + if (sizes.size() < 2) { + continue; // Skip 1D tensors } - for (const auto& layout : - {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { - { - const int32_t packed_dim = static_cast(layout); - std::vector dim_order = - calculate_dim_order(sizes.size(), packed_dim); - std::vector strides = calculate_strides(sizes, dim_order); - int64_t numel = utils::multiply_integers(sizes); - std::vector ref_strides = get_reference_strides(sizes, layout); - ASSERT_TRUE(strides == ref_strides); + for (const auto& storage_type : storage_types) { + for (const auto& config : layout_configs) { + // Skip block-packed layouts for 
tensors with less than 3 dimensions + if (config.is_block_packed && sizes.size() < 3) { + continue; + } + + // Create tensor + vTensor tensor( + context(), + sizes, + config.dtype, + storage_type, + config.layout, + /*allocate_memory = */ false); - std::vector unsqueezed_strides = - flip_and_unsqueeze(strides, kTensorStrides, numel); + // Verify sizes + ASSERT_TRUE(tensor.sizes() == sizes) + << "Sizes mismatch for layout=" << static_cast(config.layout) + << ", storage=" << static_cast(storage_type); - std::vector ref_unsqueezed_strides = - get_reference_strides(sizes, layout, true); + // Verify dtype + // For packed int8 layouts, kChar should be converted to kInt8x4 + vkapi::ScalarType expected_dtype = config.dtype; + if (config.dtype == vkapi::kChar) { + expected_dtype = vkapi::kInt8x4; + } + ASSERT_EQ(tensor.dtype(), expected_dtype) + << "Dtype mismatch for layout=" << static_cast(config.layout) + << ", expected=" << static_cast(expected_dtype) + << ", got=" << static_cast(tensor.dtype()); + + // Determine if packed_dim should be padded + // For packed int8 layouts (using kChar which converts to kInt8x4), + // always padded For texture storage, always padded For buffer storage + // with standard layouts, not padded + const bool expected_packed_dim_padded = + (config.dtype == vkapi::kChar) || (storage_type != utils::kBuffer); + + // For block-packed layouts, outer_packed_dim is also padded + const bool expected_outer_packed_dim_padded = config.is_block_packed; + + // Verify packed_dim_info + const auto& packed_dim_info = tensor.packed_dim_info(); + ASSERT_EQ(packed_dim_info.packed_dim, config.packed_dim) + << "packed_dim mismatch for layout=" + << static_cast(config.layout); + ASSERT_EQ(packed_dim_info.packed_dim_padded, expected_packed_dim_padded) + << "packed_dim_padded mismatch for layout=" + << static_cast(config.layout); + ASSERT_EQ(packed_dim_info.outer_packed_dim, config.outer_packed_dim) + << "outer_packed_dim mismatch for layout=" + << static_cast(config.layout); + ASSERT_EQ( + packed_dim_info.outer_packed_dim_padded, + expected_outer_packed_dim_padded) + << "outer_packed_dim_padded mismatch for layout=" + << static_cast(config.layout); + ASSERT_EQ(packed_dim_info.is_block_packed, config.is_block_packed) + << "is_block_packed mismatch for layout=" + << static_cast(config.layout); + + // Verify dim_order + std::vector ref_dim_order = + get_reference_dim_order(sizes.size(), config.packed_dim); + ASSERT_TRUE(tensor.dim_order() == ref_dim_order) + << "Dim order mismatch for layout=" + << static_cast(config.layout); + + // Verify padded_sizes + std::vector ref_padded_sizes = get_reference_padded_sizes( + sizes, + config.packed_dim, + expected_packed_dim_padded, + config.outer_packed_dim, + expected_outer_packed_dim_padded); + ASSERT_TRUE(tensor.padded_sizes() == ref_padded_sizes) + << "Padded sizes mismatch for layout=" + << static_cast(config.layout); + + if (storage_type == utils::kBuffer) { + // For buffer tensors, verify strides (only for standard layouts) + // For int8 layouts, we rely on padded_sizes and dim_order + // verification + if (config.dtype == vkapi::kFloat) { + std::vector ref_strides = + get_reference_strides(sizes, config.layout); + ASSERT_TRUE(tensor.strides() == ref_strides) + << "Strides mismatch for layout=" + << static_cast(config.layout); + + // Also test flip_and_unsqueeze operations + int64_t numel = utils::multiply_integers(sizes); + std::vector unsqueezed_strides = + flip_and_unsqueeze( + tensor.strides(), kTensorStrides, numel); + std::vector 
ref_unsqueezed_strides = + get_reference_strides(sizes, config.layout, true); + ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); + } + + // Verify physical_numel for buffer storage + int64_t ref_physical_numel = + get_reference_physical_numel(expected_dtype, ref_padded_sizes); + ASSERT_EQ(tensor.physical_numel(), ref_physical_numel) + << "Physical numel mismatch for buffer storage with layout=" + << static_cast(config.layout); + } else { + // For texture tensors, verify axis_map + std::vector expected_axis_map = {0, 1, 2, 2}; + ASSERT_TRUE(tensor.axis_map() == expected_axis_map) + << "Axis map mismatch for texture tensor with layout=" + << static_cast(config.layout); + ASSERT_TRUE(tensor.has_standard_axis_map()); + + // Verify image_extents for texture storage + utils::uvec3 ref_image_extents = get_reference_image_extents( + expected_dtype, + config.packed_dim, + config.outer_packed_dim, + config.is_block_packed, + ref_padded_sizes, + expected_axis_map); + ASSERT_EQ(tensor.image_extents(), ref_image_extents) + << "Image extents mismatch for texture storage with layout=" + << static_cast(config.layout); + } + } + } + } +} - ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); +TEST_F(VulkanComputeAPITest, tensor_layout_metadata_test_against_golden) { + // Test with hardcoded golden values for specific test cases. + // This complements the reference implementation test by providing concrete + // examples with known-good values. + + struct TestCase { + std::vector sizes; + vkapi::ScalarType dtype; + utils::GPUMemoryLayout layout; + // Expected values for both buffer and texture storage + std::vector expected_dim_order; + std::vector expected_padded_sizes_buffer; + std::vector expected_padded_sizes_texture; + std::vector expected_strides_buffer; + int64_t expected_physical_numel_buffer; + int64_t expected_physical_numel_texture; + utils::uvec3 expected_image_extents; + }; - std::vector whcn_dim_order = - flip_and_unsqueeze(dim_order, kTensorDimOrder, numel); + std::vector test_cases = { + // 2D tensor [5, 7] with width packed, float dtype + {/* sizes */ {5, 7}, + /* dtype */ vkapi::kFloat, + /* layout */ utils::kWidthPacked, + /* expected_dim_order */ {0, 1}, + /* expected_padded_sizes_buffer */ {1, 1, 5, 7}, + /* expected_padded_sizes_texture */ {1, 1, 5, 8}, + /* expected_strides_buffer */ {7, 1}, + /* expected_physical_numel_buffer */ 35, + /* expected_physical_numel_texture */ 40, + /* expected_image_extents */ {2, 5, 1}}, + + // 3D tensor [3, 5, 7] with channels packed, float dtype + {/* sizes */ {3, 5, 7}, + /* dtype */ vkapi::kFloat, + /* layout */ utils::kChannelsPacked, + /* expected_dim_order */ {1, 2, 0}, + /* expected_padded_sizes_buffer */ {1, 3, 5, 7}, + /* expected_padded_sizes_texture */ {1, 4, 5, 7}, + /* expected_strides_buffer */ {1, 7 * 3, 3}, + /* expected_physical_numel_buffer */ 105, + /* expected_physical_numel_texture */ 140, + /* expected_image_extents */ {7, 5, 1}}, + + // 4D tensor [2, 3, 5, 7] with height packed, float dtype + {/* sizes */ {2, 3, 5, 7}, + /* dtype */ vkapi::kFloat, + /* layout */ utils::kHeightPacked, + /* expected_dim_order */ {0, 1, 3, 2}, + /* expected_padded_sizes_buffer */ {2, 3, 5, 7}, + /* expected_padded_sizes_texture */ {2, 3, 8, 7}, + /* expected_strides_buffer */ {3 * 5 * 7, 5 * 7, 1, 5}, + /* expected_physical_numel_buffer */ 210, + /* expected_physical_numel_texture */ 336, + /* expected_image_extents */ {7, 2, 6}}, + + // 3D tensor [8, 12, 16] with packed int8 4W layout + {/* sizes */ {8, 12, 16}, + /* dtype */ 
vkapi::kChar, + /* layout */ utils::kPackedInt8_4W, + /* expected_dim_order */ {0, 1, 2}, + /* expected_padded_sizes_buffer */ {1, 8, 12, 16}, + /* expected_padded_sizes_texture */ {1, 8, 12, 16}, + /* expected_strides_buffer */ {}, + /* expected_physical_numel_buffer */ 384, + /* expected_physical_numel_texture */ 384, + /* expected_image_extents */ {1, 12, 8}}, + + // 3D tensor [8, 12, 16] with packed int8 4W4C block layout + {/* sizes */ {8, 12, 16}, + /* dtype */ vkapi::kChar, + /* layout */ utils::kPackedInt8_4W4C, + /* expected_dim_order */ {1, 2, 0}, + /* expected_padded_sizes_buffer */ {1, 8, 12, 16}, + /* expected_padded_sizes_texture */ {1, 8, 12, 16}, + /* expected_strides_buffer */ {}, + /* expected_physical_numel_buffer */ 384, + /* expected_physical_numel_texture */ 384, + /* expected_image_extents */ {4, 12, 2}}, + + // 3D tensor [9, 13, 17] with packed int8 4C layout (odd sizes) + {/* sizes */ {9, 13, 17}, + /* dtype */ vkapi::kChar, + /* layout */ utils::kPackedInt8_4C, + /* expected_dim_order */ {1, 2, 0}, + /* expected_padded_sizes_buffer */ {1, 12, 13, 17}, + /* expected_padded_sizes_texture */ {1, 12, 13, 17}, + /* expected_strides_buffer */ {}, + /* expected_physical_numel_buffer */ 663, + /* expected_physical_numel_texture */ 663, + /* expected_image_extents */ {17, 13, 1}}, + + // 3D tensor [9, 13, 17] with packed int8 4H4W block layout (odd sizes) + {/* sizes */ {9, 13, 17}, + /* dtype */ vkapi::kChar, + /* layout */ utils::kPackedInt8_4H4W, + /* expected_dim_order */ {0, 1, 2}, + /* expected_padded_sizes_buffer */ {1, 9, 16, 20}, + /* expected_padded_sizes_texture */ {1, 9, 16, 20}, + /* expected_strides_buffer */ {}, + /* expected_physical_numel_buffer */ 720, + /* expected_physical_numel_texture */ 720, + /* expected_image_extents */ {5, 4, 9}}, + }; - std::vector ref_whcn_dim_order = - create_whcn_dim_order(dim_order); + for (size_t i = 0; i < test_cases.size(); ++i) { + const auto& tc = test_cases[i]; - ASSERT_TRUE(whcn_dim_order == ref_whcn_dim_order); + // Test with buffer storage + { + vTensor tensor_buffer( + context(), + tc.sizes, + tc.dtype, + utils::kBuffer, + tc.layout, + /*allocate_memory = */ false); + + // Verify dtype (kChar -> kInt8x4) + vkapi::ScalarType expected_dtype = tc.dtype; + if (tc.dtype == vkapi::kChar) { + expected_dtype = vkapi::kInt8x4; + } + ASSERT_EQ(tensor_buffer.dtype(), expected_dtype) + << "Test case " << i << ": Buffer dtype mismatch"; + + // Verify dim_order + ASSERT_TRUE(tensor_buffer.dim_order() == tc.expected_dim_order) + << "Test case " << i << ": Buffer dim_order mismatch" + << " (expected size: " << tc.expected_dim_order.size() + << ", actual size: " << tensor_buffer.dim_order().size() << ")"; + + // Verify padded_sizes + ASSERT_TRUE( + tensor_buffer.padded_sizes() == tc.expected_padded_sizes_buffer) + << "Test case " << i << ": Buffer padded_sizes mismatch"; + + // Verify strides (only for float dtype) + if (tc.dtype == vkapi::kFloat && !tc.expected_strides_buffer.empty()) { + ASSERT_TRUE(tensor_buffer.strides() == tc.expected_strides_buffer) + << "Test case " << i << ": Buffer strides mismatch"; + } - // Create new vTensor and check that the strides are correct - vTensor new_v_tensor( - context(), - sizes, - vkapi::kFloat, - utils::kBuffer, - layout, - /*allocate_memory = */ false); + // Verify physical_numel + ASSERT_EQ( + tensor_buffer.physical_numel(), tc.expected_physical_numel_buffer) + << "Test case " << i << ": Buffer physical_numel mismatch"; + } - ASSERT_TRUE(new_v_tensor.strides() == ref_strides); + // Test 
with texture storage + { + vTensor tensor_texture( + context(), + tc.sizes, + tc.dtype, + utils::kTexture3D, + tc.layout, + /*allocate_memory = */ false); - // Resize vtensor and check that updated metadata is correct - v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); - ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); + // Verify dtype (kChar -> kInt8x4) + vkapi::ScalarType expected_dtype = tc.dtype; + if (tc.dtype == vkapi::kChar) { + expected_dtype = vkapi::kInt8x4; } + ASSERT_EQ(tensor_texture.dtype(), expected_dtype) + << "Test case " << i << ": Texture dtype mismatch"; + + // Verify dim_order (texture doesn't use dim_order, but it's still + // computed) + ASSERT_TRUE(tensor_texture.dim_order() == tc.expected_dim_order) + << "Test case " << i << ": Texture dim_order mismatch"; + + // Verify padded_sizes + ASSERT_TRUE( + tensor_texture.padded_sizes() == tc.expected_padded_sizes_texture) + << "Test case " << i << ": Texture padded_sizes mismatch"; + + // Verify axis_map + std::vector expected_axis_map = {0, 1, 2, 2}; + ASSERT_TRUE(tensor_texture.axis_map() == expected_axis_map) + << "Test case " << i << ": Texture axis_map mismatch"; + + // Verify physical_numel + ASSERT_EQ( + tensor_texture.physical_numel(), tc.expected_physical_numel_texture) + << "Test case " << i << ": Texture physical_numel mismatch"; + + // Verify image_extents + ASSERT_EQ(tensor_texture.image_extents(), tc.expected_image_extents) + << "Test case " << i << ": Texture image_extents mismatch" + << " (expected: [" << tc.expected_image_extents[0] << ", " + << tc.expected_image_extents[1] << ", " + << tc.expected_image_extents[2] << "], got: [" + << tensor_texture.image_extents()[0] << ", " + << tensor_texture.image_extents()[1] << ", " + << tensor_texture.image_extents()[2] << "])"; } } }
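For reference, the 4W4C golden case above can be checked by hand; the arithmetic below mirrors calculate_image_extents and calculate_gpu_buffer_numel with the default axis map {0, 1, 2, 2}:

// sizes {8, 12, 16} (C = 8, H = 12, W = 16), layout kPackedInt8_4W4C:
//   padded sizes              -> {1, 8, 12, 16}
//   initial extents (X, Y, Z)  = (W, H, C) = (16, 12, 8)
//   outer packed dim (width)   : X = 16 / 4 = 4
//   inner packed dim (channels): Z =  8 / 4 = 2
//   batch size is 1, so image extents = {4, 12, 2}
//   physical numel = (1 * 8 * 12 * 16) / 4 = 384 kInt8x4 elements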