From 8aeb5f46da6fcaa41ca542a7a012bac2fdf839db Mon Sep 17 00:00:00 2001
From: ssjia
Date: Fri, 26 Dec 2025 17:40:01 -0800
Subject: [PATCH] [ET-VK][refactor] Introduce PackedDimInfo struct for packed dimension metadata

## Context

With the introduction of block-packed memory layouts for quantized tensors, the metadata stored by `vTensor` to describe the data layout within a texture/buffer was no longer sufficient to completely describe that layout. This created an awkward pattern where the `GPUMemoryLayout` had to be estimated in order to compute storage descriptors such as image extents.

This diff addresses the problem by introducing the `PackedDimInfo` struct to `vTensor`. It provides a complete description of how tensor data may be organized in the GPU buffer/texture used to store it, and allows the functions that compute the buffer numel or texture extents to be simplified.

## `PackedDimInfo`

Introduced the `PackedDimInfo` struct, which encapsulates all information about packed dimensions in GPU tensors. This improves code organization and makes the relationship between related metadata fields explicit.

The `PackedDimInfo` struct contains:

- `packed_dim`: which dimension is tightly packed (WHCN index), i.e. contiguous in memory
- `packed_dim_padded`: whether the packed dimension is padded to a multiple of 4; some layouts do this to accommodate vectorized loads/stores
- `outer_packed_dim`: second-level packing for block-packed layouts (4W4C, 4H4W); for layouts with only a single level of packing, this is equal to `packed_dim`
- `outer_packed_dim_padded`: whether the outer packed dim is padded (tiled layouts only)

## Changes

- Added the `PackedDimInfo` struct along with the helper function `calculate_packed_dim_info()`
- Replaced the `packed_dim_` member with `packed_dim_info_` in the `vTensor` class
- Updated function signatures to accept `PackedDimInfo&` instead of `packed_dim_`:
  * create_hashed_layout
  * calculate_dim_order
  * calculate_padded_sizes
  * calculate_logical_limits
  * TextureMetadata constructor/update
  * vTensorStorage constructor
- Added a `packed_dim_info()` accessor to the `vTensor` and `ComputeGraph` classes
- Store an additional `padded_sizes_` member in `vTensor`, which is now used for strides/image extents/GPU buffer numel computation instead of `sizes_` directly
- Introduce memory layouts for the kInt8x4 type that use only a single level of packing

Differential Revision: [D89832382](https://our.internmc.facebook.com/intern/diff/D89832382/)

[ghstack-poisoned]
---
 .../vulkan/runtime/api/containers/Tensor.cpp | 361 ++++++++++--------
 .../vulkan/runtime/api/containers/Tensor.h   |  86 +++--
 backends/vulkan/runtime/graph/ComputeGraph.h |   5 +
 .../vulkan/runtime/graph/ops/impl/Concat.cpp |  10 +-
 .../vulkan/runtime/graph/ops/impl/Tan.cpp    |   6 +-
 .../vulkan/runtime/utils/StorageUtils.cpp    |   3 +
 backends/vulkan/runtime/utils/StorageUtils.h |  29 ++
 .../vulkan/test/vulkan_compute_api_test.cpp  |  12 +-
 8 files changed, 323 insertions(+), 189 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 5a1c445889e..0c027f9c309 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -14,6 +14,38 @@ namespace vkcompute {
 namespace api {
 
+PackedDimInfo calculate_packed_dim_info(
+    const utils::GPUMemoryLayout memory_layout,
+    const utils::StorageType storage_type) {
+  const int32_t packed_dim = utils::to_packed_dim<int32_t>(memory_layout);
+
+  // Determine if packed dimension
is padded + const bool packed_dim_padded = storage_type != utils::kBuffer || + memory_layout == utils::kPackedInt8_4W || + memory_layout == utils::kPackedInt8_4C || + memory_layout == utils::kPackedInt8_4H || + memory_layout == utils::kPackedInt8_4W4C || + memory_layout == utils::kPackedInt8_4H4W; + + // Determine outer packed dimension (for tiled layouts) + int32_t outer_packed_dim; + if (memory_layout == utils::kPackedInt8_4W4C) { + outer_packed_dim = 0; // Width + } else if (memory_layout == utils::kPackedInt8_4H4W) { + outer_packed_dim = 1; // Height + } else { + outer_packed_dim = packed_dim; // No tiled packing + } + + // Determine if outer packed dimension is padded (only for tiled layouts) + const bool outer_packed_dim_padded = + memory_layout == utils::kPackedInt8_4W4C || + memory_layout == utils::kPackedInt8_4H4W; + + return PackedDimInfo( + packed_dim, packed_dim_padded, outer_packed_dim, outer_packed_dim_padded); +} + /* * For PackedInt8 memory layouts, ensure that the scalar type used for the * tensor is kInt8x4. Otherwise, return the original scalar type. @@ -35,24 +67,23 @@ vkapi::ScalarType get_effective_scalar_type( */ std::vector calculate_sizes( const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout) { + const PackedDimInfo& packed_dim_info) { auto sizes = std::vector{ image.extents().width, image.extents().height, image.extents().depth}; - const auto packed_dim = utils::to_packed_dim(memory_layout); - sizes.at(packed_dim) *= 4; + sizes.at(packed_dim_info.packed_dim) *= 4; return sizes; } std::vector calculate_dim_order( const size_t ndim, - const int32_t packed_dim) { + const PackedDimInfo& packed_dim_info) { // Special case for zero dim tensors if (ndim == 0) { return {0}; } std::vector dim_order(ndim); // Explicitly convert ndim to signed to prevent underflow - int64_t last_dim = int64_t(ndim) - 1 - packed_dim; + int64_t last_dim = int64_t(ndim) - 1 - packed_dim_info.packed_dim; int64_t cur_dim = 0; for (int d = 0; d < ndim; ++d) { @@ -70,23 +101,28 @@ std::vector calculate_dim_order( } std::vector calculate_strides( - const std::vector& sizes, + const vkapi::ScalarType dtype, + const size_t ndim, + const std::vector& padded_sizes, const std::vector& dim_order) { // For zero dim tensors - if (sizes.size() == 0) { + if (ndim == 0) { return {1}; } - size_t ndim = sizes.size(); std::vector strides(ndim); + // padded_sizes has align_up_4(ndim) dimensions, with padding at the start + // We need to offset when indexing into padded_sizes + const int64_t offset = padded_sizes.size() - ndim; + strides[dim_order[ndim - 1]] = 1; for (int32_t i = ndim - 2; i >= 0; --i) { - if (sizes[dim_order[i + 1]] == 0) { + if (padded_sizes[dim_order[i + 1] + offset] == 0) { strides[dim_order[i]] = strides[dim_order[i + 1]]; } else { strides[dim_order[i]] = - strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; + strides[dim_order[i + 1]] * padded_sizes[dim_order[i + 1] + offset]; } } @@ -179,7 +215,7 @@ utils::ivec4 flip_and_unsqueeze_ivec4( std::vector calculate_padded_sizes( const std::vector& sizes, - const int32_t packed_dim) { + const PackedDimInfo& packed_dim_info) { int64_t ndim = sizes.size(); if (ndim == 0) { ndim = 1; @@ -192,21 +228,47 @@ std::vector calculate_padded_sizes( padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); } - // Pad the packed dim to the next multiple of 4. 
- const int64_t dim_offset = packed_dim + 1; - const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); - padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + // Pad the packed dim to the next multiple of 4 if specified. + // This is required for texture storage and packed layouts. + if (packed_dim_info.packed_dim_padded) { + const int64_t dim_offset = packed_dim_info.packed_dim + 1; + const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); + padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + } + + // For tiled layouts (e.g., 4W4C, 4H4W), also pad the outer packed dimension + // if it's different from the inner packed dimension and is marked as padded. + if (packed_dim_info.outer_packed_dim != packed_dim_info.packed_dim && + packed_dim_info.outer_packed_dim_padded) { + const int64_t outer_dim_offset = packed_dim_info.outer_packed_dim + 1; + const int64_t outer_padded_dim_size = + utils::val_at(-outer_dim_offset, sizes); + padded_sizes.at(ndim_up4 - outer_dim_offset) = + utils::align_up_4(outer_padded_dim_size); + } return padded_sizes; } utils::uvec3 calculate_image_extents( + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, const std::vector& padded_sizes, - const utils::GPUMemoryLayout memory_layout, - const std::vector& axis_map, - const int32_t packed_dim) { + const std::vector& axis_map) { utils::uvec3 extents({1, 1, 1}); + const int64_t packed_dim_axis = axis_map.at(packed_dim_info.packed_dim); + const int64_t outer_packed_dim_axis = + axis_map.at(packed_dim_info.outer_packed_dim); + + // If the packed dim is not padded to the next multiple of 4, then that means + // this tensor is using buffer storage and does not require texture extents. + const int64_t packed_dim_idx = + padded_sizes.size() - 1 - packed_dim_info.packed_dim; + if (padded_sizes.at(packed_dim_idx) % 4 != 0) { + return extents; + } + // For high dimensional tensors, buffer storage must be used. No need to // compute image extents in this case. if (padded_sizes.size() > 4) { @@ -222,25 +284,26 @@ utils::uvec3 calculate_image_extents( } // For "regular" tensor dtypes, 4 elements along the packed dim are packed - // into one texel (4-component vectorized type). However, for packed int8 - // memory layouts, an additional level of packing is employed where 4 int8 - // elements are packed into one int32, and then 4 int32 are packed into each - // ivec4 texel. - if (utils::is_packed_int8_layout(memory_layout)) { - // Each int in the ivec4 contains 4 channels. The overall ivec4 contains - // data for a 1Hx4Wx4C block of the input tensor. - if (memory_layout == utils::kPackedInt8_4W4C) { - VK_CHECK_COND(packed_dim == 2); - extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u); + // into one texel (4-component vectorized type). However, for kInt8x4 dtype, + // an additional level of packing is employed where 4 int8 elements are + // packed into one int32, and then 4 int32 are packed into each ivec4 texel. + if (dtype == vkapi::kInt8x4) { + // For layouts with only one packed dimension, loading an ivec4 texel from + // the texture loads 16 int8 values (4 int32 that each contain 4 int8). + if (packed_dim_info.outer_packed_dim == packed_dim_info.packed_dim) { + extents[packed_dim_axis] = utils::div_up(extents[packed_dim_axis], 16u); } - // Each int in the ivec4 contains 4 elements along the width dim. The - // overall ivec4 contains data for a 4Hx4W block of the input tensor. 
- else if (memory_layout == utils::kPackedInt8_4H4W) { - VK_CHECK_COND(packed_dim == 0); - extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u); - } else { - VK_THROW("Unhandled packed int8 memory layout!"); + // Layouts with two packed dimension (e.g., 4W4C, 4H4W) load a 4x4 block of + // data from two dimensions with each ivec4 texel load, as opposed to 16 + // adjacent values from a single dimension. + else { + VK_CHECK_COND(extents[outer_packed_dim_axis] % 4 == 0); + extents[outer_packed_dim_axis] /= 4; + VK_CHECK_COND(extents[packed_dim_axis] % 4 == 0); + extents[packed_dim_axis] /= 4; } + } else { + extents[packed_dim_axis] /= 4; } // axis_map[3] indicates the WHCN index of the dimension used for batch @@ -251,9 +314,6 @@ utils::uvec3 calculate_image_extents( // Multiply the extents of the batch axis by the batch size. extents[batch_axis] *= padded_sizes.at(0); - VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); - extents[axis_map.at(packed_dim)] /= 4; - return extents; } @@ -285,73 +345,43 @@ utils::uvec3 calculate_logical_limits( * directly from tensor sizes. */ utils::uvec3 calculate_logical_limits( - const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const std::vector& axis_map, - const int32_t packed_dim) { + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, + const std::vector& axis_map) { return calculate_logical_limits( - calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - memory_layout, - axis_map, - packed_dim), + calculate_image_extents(dtype, packed_dim_info, padded_sizes, axis_map), axis_map); } +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. + */ int64_t calculate_gpu_buffer_numel( - const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const vkapi::ScalarType dtype) { + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes) { size_t numel; - // Mirrors the logic in calculate_image_extents for packed int8 memory layouts + numel = utils::multiply_integers(padded_sizes); + + // For this dtype, the data buffer is interpreted as an array of int32, where + // each int32 contains 4xint8 values. To account for this, the number of + // elements needs to be divided by 4. if (dtype == vkapi::kInt8x4) { - VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout)); - std::vector blocks_in_dim = - flip_and_unsqueeze(sizes, kTensorSizes, 0); - // Each ivec4 contains data for a 1Hx4Wx4C block of the input - if (memory_layout == utils::kPackedInt8_4W4C) { - blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); - blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]); - } - // Each ivec4 contains data for a 4Hx4W block of the input - else if (memory_layout == utils::kPackedInt8_4H4W) { - blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); - blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]); - } else { - VK_THROW("Unhandled packed int8 memory layout!"); - } - // Each block is represented as an ivec4, and the base dtype of the buffer - // is int. Therefore, need to multiply the number of blocks by 4 to obtain - // the number of int elements in the data buffer. - numel = utils::multiply_integers(blocks_in_dim) * 4; - } - // Case for "regular" dtypes/memory layouts - else { - numel = utils::multiply_integers(sizes); - - // For 8-bit types, align to the next multiple of 4. 
For devices that do not - // support 8-bit storage buffers, the tensor data will be interpreted as an - // array of int32 instead. - if (vkapi::element_size(dtype) == 1) { - numel = utils::align_up_4(numel); - } + // Should already be a multiple of 4 due to padding the packed dimensions + VK_CHECK_COND(numel % 4 == 0); + numel /= 4; } - return numel; -} -int64_t calculate_staging_or_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const vkapi::ScalarType dtype) { - // For texture backed tensors, simply multiply the total number of texels by 4 - if (storage_type != utils::kBuffer) { - return image_extents[0] * image_extents[1] * image_extents[2] * 4; + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. + if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); } - return calculate_gpu_buffer_numel(sizes, memory_layout, dtype); + return numel; } template ::value>> @@ -365,13 +395,13 @@ int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { int32_t create_hashed_layout( const std::vector& dim_order, const std::vector& axis_map, - const int32_t packed_dim, + const PackedDimInfo& packed_dim_info, const utils::StorageType storage_type) { if (storage_type == utils::kBuffer) { return pack_into_int32( flip_and_unsqueeze(dim_order, kTensorDimOrder, 0), 0); } - return pack_into_int32(axis_map, packed_dim); + return pack_into_int32(axis_map, packed_dim_info.packed_dim); } size_t calculate_max_ubo_nbytes( @@ -500,24 +530,19 @@ vTensorStorage::vTensorStorage( const utils::StorageType storage_type, const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, image_extents_(calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - memory_layout, - axis_map, - packed_dim)), - buffer_length_{calculate_staging_or_gpu_buffer_numel( - context_, - sizes, - image_extents_, - storage_type, - memory_layout, - dtype)}, + dtype, + packed_dim_info, + padded_sizes, + axis_map)), + buffer_length_{ + calculate_gpu_buffer_numel(dtype, packed_dim_info, padded_sizes)}, buffer_offset_{0}, image_(allocate_image( context_, @@ -634,18 +659,20 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory, const utils::AxisMapLayout axis_map_layout) - : dtype_(get_effective_scalar_type(dtype, memory_layout)), + : packed_dim_info_(calculate_packed_dim_info(memory_layout, storage_type)), + dtype_(get_effective_scalar_type(dtype, memory_layout)), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), - packed_dim_(utils::to_packed_dim(memory_layout)), - dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), + dim_order_(calculate_dim_order(sizes_.size(), packed_dim_info_)), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), - strides_(calculate_strides(sizes, dim_order_)), + strides_( + calculate_strides(dtype_, sizes.size(), padded_sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), hashed_layout_(create_hashed_layout( dim_order_, 
axis_map_, - packed_dim_, + packed_dim_info_, storage_type)), // Related to tensor metadata UBOs min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, @@ -659,8 +686,8 @@ vTensor::vTensor( storage_type, memory_layout, axis_map_, - packed_dim_, - sizes, + packed_dim_info_, + padded_sizes_, dtype_, allocate_memory)) { // uniform_data_ only valid for low dim tensors @@ -683,10 +710,12 @@ vTensor::vTensor( const vkapi::VulkanImage& image, const utils::GPUMemoryLayout memory_layout, const utils::AxisMapLayout axis_map_layout) - : dtype_(vkapi::element_scalartype(image.format())), + : packed_dim_info_( + calculate_packed_dim_info(memory_layout, utils::kTexture3D)), + dtype_(vkapi::element_scalartype(image.format())), // Calculate tensor metadata - sizes_(calculate_sizes(image, memory_layout)), - packed_dim_(utils::to_packed_dim(memory_layout)), + sizes_(calculate_sizes(image, packed_dim_info_)), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), @@ -694,7 +723,7 @@ vTensor::vTensor( hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, utils::kTexture3D)), // Related to tensor metadata UBOs min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, @@ -713,10 +742,11 @@ vTensor::vTensor( } vTensor::vTensor(vTensor& other) - : dtype_(other.dtype_), + : packed_dim_info_{other.packed_dim_info_}, + dtype_(other.dtype_), // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), - packed_dim_{other.packed_dim_}, + padded_sizes_(other.padded_sizes_.begin(), other.padded_sizes_.end()), dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), @@ -735,18 +765,20 @@ vTensor::vTensor( vTensor& other, const std::vector& sizes, const std::vector& dim_order) - : dtype_(other.dtype_), + : packed_dim_info_(other.packed_dim_info_), + dtype_(other.dtype_), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), - packed_dim_(other.packed_dim_), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), - strides_(calculate_strides(sizes_, dim_order_)), + strides_( + calculate_strides(dtype_, sizes_.size(), padded_sizes_, dim_order_)), numel_(other.numel_), hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, other.storage_type())), min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, max_ubo_nbytes_{other.max_ubo_nbytes_}, @@ -840,15 +872,15 @@ vTensor::TextureMetadata::TextureMetadata( const std::vector& src_sizes, const TextureLimits& src_logical_limits, const std::vector& src_axis_map, - const int32_t src_packed_dim) { - update(src_sizes, src_logical_limits, src_axis_map, src_packed_dim); + const PackedDimInfo& src_packed_dim_info) { + update(src_sizes, src_logical_limits, src_axis_map, src_packed_dim_info); } void vTensor::TextureMetadata::update( const std::vector& src_sizes, const TextureLimits& src_logical_limits, const std::vector& src_axis_map, - const int32_t src_packed_dim) { + const PackedDimInfo& src_packed_dim_info) { // Convert sizes to flipped and unsqueezed format (fixed to 4 dimensions for // texture) std::vector fu_sizes = @@ -877,7 +909,7 @@ void vTensor::TextureMetadata::update( axis_map[i] = 0; } - packed_dim = src_packed_dim; + 
packed_dim = src_packed_dim_info.packed_dim; } vkapi::VulkanImage& vTensor::image( @@ -911,17 +943,36 @@ vkapi::VulkanBuffer& vTensor::buffer( } utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + // Check for tiled layouts (two-level packing) - only applicable for kInt8x4 + if (dtype_ == vkapi::kInt8x4 && + packed_dim_info_.outer_packed_dim != packed_dim_info_.packed_dim) { + // For 4W4C: packed_dim = Channels, outer_packed_dim = Width + if (packed_dim_info_.packed_dim == WHCN::kChannelsDim && + packed_dim_info_.outer_packed_dim == WHCN::kWidthDim) { + return utils::kPackedInt8_4W4C; + } + // For 4H4W: packed_dim = Width, outer_packed_dim = Height + if (packed_dim_info_.packed_dim == WHCN::kWidthDim && + packed_dim_info_.outer_packed_dim == WHCN::kHeightDim) { + return utils::kPackedInt8_4H4W; + } + VK_THROW("Invalid tiled layout configuration for kInt8x4 dtype"); + } + + // Single-level packing layouts if (dtype_ == vkapi::kInt8x4) { - switch (packed_dim_) { + switch (packed_dim_info_.packed_dim) { case WHCN::kChannelsDim: - return utils::kPackedInt8_4W4C; + return utils::kPackedInt8_4C; case WHCN::kWidthDim: - return utils::kPackedInt8_4H4W; + return utils::kPackedInt8_4W; + case WHCN::kHeightDim: + return utils::kPackedInt8_4H; default: VK_THROW("Invalid packed dim for Tensor with kInt8x4 type"); } } - switch (packed_dim_) { + switch (packed_dim_info_.packed_dim) { case WHCN::kWidthDim: return utils::kWidthPacked; case WHCN::kHeightDim: @@ -996,7 +1047,7 @@ const vkapi::BufferBindInfo vTensor::texture_meta_ubo() { size_t ubo_nbytes = sizeof(TextureMetadata); if (!texture_meta_.buffer()) { TextureLimits limits(logical_limits()); - TextureMetadata data(sizes_, limits, axis_map_, packed_dim_); + TextureMetadata data(sizes_, limits, axis_map_, packed_dim_info_); texture_meta_ = ParamsBuffer(storage_->context_, data); } return vkapi::BufferBindInfo(texture_meta_.buffer(), 0, ubo_nbytes); @@ -1049,7 +1100,8 @@ void vTensor::acquire_allocation(vkapi::Allocation&& allocation) { void vTensor::update_metadata() { numel_ = utils::multiply_integers(sizes_); - strides_ = calculate_strides(sizes_, dim_order_); + strides_ = + calculate_strides(dtype_, sizes_.size(), padded_sizes_, dim_order_); // Update uniform data if it has been modified if (sizes_.size() <= 4) { @@ -1061,7 +1113,7 @@ void vTensor::update_metadata() { uniform_data_->strides_v = flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); uniform_data_->logical_limits.limits = calculate_logical_limits( - sizes_, estimate_memory_layout(), axis_map_, packed_dim_); + dtype_, packed_dim_info_, padded_sizes_, axis_map_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); @@ -1088,21 +1140,17 @@ void vTensor::update_metadata() { if (texture_meta_.buffer()) { TextureMetadata data( - sizes_, uniform_data_->logical_limits, axis_map_, packed_dim_); + sizes_, uniform_data_->logical_limits, axis_map_, packed_dim_info_); texture_meta_.update(data); } } void vTensor::check_sizes(const std::vector& sizes) const { - utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout(); if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), - est_memory_layout, - axis_map_, - packed_dim_); + dtype_, packed_dim_info_, padded_sizes_, axis_map_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -1117,7 +1165,7 @@ void vTensor::check_sizes(const std::vector& sizes) const { // For buffer storage check that the current buffer is large enough for // the new sizes of the tensor. int64_t numel = - calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_); + calculate_gpu_buffer_numel(dtype_, packed_dim_info_, padded_sizes_); bool valid_resize = numel + storage_->buffer_offset_ <= storage_->buffer_length_; VK_CHECK_COND( @@ -1137,11 +1185,12 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); dim_order_ = new_dim_order; // Update the hashed layout because dim order is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + hashed_layout_ = create_hashed_layout( + dim_order_, axis_map_, packed_dim_info_, storage_type()); update_metadata(); } @@ -1149,9 +1198,10 @@ void vTensor::virtual_reconfigure( void vTensor::virtual_clone(const vTensor& other) { VK_CHECK_COND(is_view_of(other)); sizes_ = other.sizes_; + padded_sizes_ = other.padded_sizes_; dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; - packed_dim_ = other.packed_dim_; + packed_dim_info_ = other.packed_dim_info_; hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); @@ -1164,6 +1214,7 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { check_sizes(new_sizes); sizes_ = new_sizes; + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); update_metadata(); } @@ -1191,10 +1242,10 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { const int dim0_whcn = sizes_.size() - 1 - dim0; const int dim1_whcn = sizes_.size() - 1 - dim1; - if (packed_dim_ == dim0_whcn) { - packed_dim_ = dim1_whcn; - } else if (packed_dim_ == dim1_whcn) { - packed_dim_ = dim0_whcn; + if (packed_dim_info_.packed_dim == dim0_whcn) { + packed_dim_info_.packed_dim = dim1_whcn; + } else if (packed_dim_info_.packed_dim == dim1_whcn) { + packed_dim_info_.packed_dim = dim0_whcn; } if (storage_type() == utils::kBuffer) { @@ -1213,8 +1264,12 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { } // Update the hashed layout because dim order / axis mpa is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + hashed_layout_ = create_hashed_layout( + dim_order_, axis_map_, packed_dim_info_, storage_type()); + + // Recalculate padded_sizes_ based on the new sizes and updated + // packed_dim_info + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 967148b8dbe..9ccff1b38b8 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -21,6 +21,43 @@ namespace api { static constexpr size_t kTensorDimLimit = 8; +/* + * PackedDimInfo encapsulates metadata about packed dimensions in GPU tensors. + * This includes information about which dimension is packed, whether it's + * padded, and tiled packing information for special layouts like 4W4C and 4H4W. 
+ */ +struct PackedDimInfo { + // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for + // width, 1 for height, etc.). For texture backed tensors, this describes + // which dimension is packed along a texel. For buffer backed tensors, this + // describes which dimension has a stride of 1 (i.e. is last in the dim + // order). + int32_t packed_dim; + // Describes if the packed dimension is padded to a multiple of 4. This will + // be true for all tensors that use texture storage, and will also be true + // for the PACKED_PADDED memory layouts. + bool packed_dim_padded; + // Describes a second level of packing, if applicable (which will only apply + // to the 4W4C and 4H4W layouts). If there is no second level of packing, + // then this will be equal to packed_dim. Otherwise, it will represent the + // outer dim used to construct tiled packing. For example, 4W4C will have + // packed_dim = 2 and outer_packed_dim = 0. + int32_t outer_packed_dim; + // Whether the outer packed dim is padded to the next multiple of 4. This is + // true only for tiled layouts. + bool outer_packed_dim_padded; + + PackedDimInfo( + int32_t dim, + bool dim_padded, + int32_t outer_dim, + bool outer_dim_padded) + : packed_dim(dim), + packed_dim_padded(dim_padded), + outer_packed_dim(outer_dim), + outer_packed_dim_padded(outer_dim_padded) {} +}; + /* * Given a GPUMemoryLayout value, produce a dim order vector that matches the * given memory layout. The produced dim order vector will be in the NCHW @@ -28,14 +65,16 @@ static constexpr size_t kTensorDimLimit = 8; */ std::vector calculate_dim_order( const size_t ndim, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); /* * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) * dimension order, calculate the strides of the tensor. */ std::vector calculate_strides( - const std::vector& sizes, + const vkapi::ScalarType dtype, + const size_t ndim, + const std::vector& padded_sizes, const std::vector& dim_order); /* @@ -55,15 +94,16 @@ std::vector calculate_strides( */ std::vector calculate_padded_sizes( const std::vector& sizes, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); /* * Calculate the image extents required of a texture backed tensor. */ utils::uvec3 calculate_image_extents( + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, const std::vector& padded_sizes, - const std::vector& axis_map, - const int32_t packed_dim); + const std::vector& axis_map); struct LastAccess { vkapi::PipelineStageFlags stage; @@ -79,18 +119,6 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; -/* - * Calculate the number of elements that a GPU buffer would require to store the - * contents of a tensor. This will depend on the storage type and dtype of the - * tensor, as well as the features available on the device. 
- */ -int64_t calculate_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype); - class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -101,8 +129,8 @@ class vTensorStorage final { const utils::StorageType storage_type, const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -295,13 +323,13 @@ class vTensor final { const std::vector& sizes, const TextureLimits& logical_limits, const std::vector& axis_map, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); void update( const std::vector& sizes, const TextureLimits& logical_limits, const std::vector& axis_map, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); }; private: @@ -310,16 +338,14 @@ class vTensor final { * to construct a tensor. */ + // Information about packed dimension padding and tiled packing + PackedDimInfo packed_dim_info_; // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; // sizes of the tensor in NCHW dimension order std::vector sizes_; - // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for - // width, 1 for height, etc.). For texture backed tensors, this describes - // which dimension is packed along a texel. For buffer backed tensors, this - // describes which dimension has a stride of 1 (i.e. is last in the dim - // order). - int32_t packed_dim_; + // padded sizes of the tensor (pre-computed to avoid recalculation) + std::vector padded_sizes_; /* * "Layout" metadata. These describe with further detail how tensor data is @@ -483,7 +509,11 @@ class vTensor final { utils::GPUMemoryLayout estimate_memory_layout() const; inline int32_t packed_dim() const { - return packed_dim_; + return packed_dim_info_.packed_dim; + } + + inline const PackedDimInfo& packed_dim_info() const { + return packed_dim_info_; } /* diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 18e97d7b516..a7a2a09f1fc 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -442,6 +442,11 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().packed_dim(); } + inline const api::PackedDimInfo& packed_dim_info_of( + const ValueRef idx) const { + return values_.at(idx).toConstTensor().packed_dim_info(); + } + inline int32_t concat_dim_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().concat_dim(); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp index 0a4acb6cef3..1bc256ef6ef 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp @@ -108,17 +108,19 @@ utils::uvec3 concat_pick_global_wg_size( // Calculate what the image extents would be of a tensor with the input // volume's sizes. This produces the number of texels that would need to be // written to. 
- const int32_t packed_dim = graph->packed_dim_of(out); + const api::PackedDimInfo& packed_dim_info = graph->packed_dim_info_of(out); std::vector inp_volume_texel_sizes = - api::calculate_padded_sizes(inp_volume_sizes, packed_dim); + api::calculate_padded_sizes(inp_volume_sizes, packed_dim_info); // If the concat_dim is the same as the packed dim, and the concat_offset for // this input batch is not a multiple of 4, then the data from an input texel // may be split up between two output texels. For example: // I0 , I1 , I2 , I2 // O0 , O1 , O2 , X | X , X , X , X // Therefore, 1 texel is added to the packed dim to account for this. - inp_volume_texel_sizes.at(3 - packed_dim) = - utils::div_up_4(inp_volume_texel_sizes.at(3 - packed_dim)) + 1; + inp_volume_texel_sizes.at(3 - packed_dim_info.packed_dim) = + utils::div_up_4( + inp_volume_texel_sizes.at(3 - packed_dim_info.packed_dim)) + + 1; const uint32_t inp_volume_texel_numel = utils::multiply_integers(inp_volume_texel_sizes); diff --git a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp index 687b3923354..223f082d6a6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp @@ -34,7 +34,11 @@ void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) { add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); vkapi::ParamsBindList ubos({}); - ubos.append({graph.logical_limits_ubo(out)}); + if (graph.storage_type_of(out) == utils::kBuffer) { + ubos.append({graph.numel_ubo(out)}); + } else { + ubos.append({graph.logical_limits_ubo(out)}); + } graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, diff --git a/backends/vulkan/runtime/utils/StorageUtils.cpp b/backends/vulkan/runtime/utils/StorageUtils.cpp index cfe3d9e159a..767c1294c39 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.cpp +++ b/backends/vulkan/runtime/utils/StorageUtils.cpp @@ -13,6 +13,9 @@ namespace utils { bool is_packed_int8_layout(const GPUMemoryLayout layout) { switch (layout) { + case kPackedInt8_4W: + case kPackedInt8_4C: + case kPackedInt8_4H: case kPackedInt8_4W4C: case kPackedInt8_4H4W: return true; diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index a269adccecb..6ae9fab768a 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -101,6 +101,11 @@ enum class GPUMemoryLayout : uint8_t { * 16 element block is loaded, rather than 4 elements along one dimension. 
*/ + // Single-dimension packed layouts (with padding) + TENSOR_PACKED_INT8_4W = 5u, + TENSOR_PACKED_INT8_4C = 6u, + TENSOR_PACKED_INT8_4H = 7u, + TENSOR_PACKED_INT8_4W4C = 3u, TENSOR_PACKED_INT8_4H4W = 4u, }; @@ -114,6 +119,15 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; +static constexpr GPUMemoryLayout kPackedInt8_4W = + GPUMemoryLayout::TENSOR_PACKED_INT8_4W; + +static constexpr GPUMemoryLayout kPackedInt8_4C = + GPUMemoryLayout::TENSOR_PACKED_INT8_4C; + +static constexpr GPUMemoryLayout kPackedInt8_4H = + GPUMemoryLayout::TENSOR_PACKED_INT8_4H; + static constexpr GPUMemoryLayout kPackedInt8_4W4C = GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C; @@ -129,6 +143,12 @@ T to_packed_dim(const GPUMemoryLayout layout) { return 1; case kChannelsPacked: return 2; + case kPackedInt8_4W: + return 0; + case kPackedInt8_4C: + return 2; + case kPackedInt8_4H: + return 1; case kPackedInt8_4W4C: return 2; case kPackedInt8_4H4W: @@ -170,6 +190,15 @@ inline std::ostream& operator<<( case kChannelsPacked: os << "TENSOR_CHANNELS_PACKED"; break; + case kPackedInt8_4W: + os << "TENSOR_PACKED_INT8_4W"; + break; + case kPackedInt8_4C: + os << "TENSOR_PACKED_INT8_4C"; + break; + case kPackedInt8_4H: + os << "TENSOR_PACKED_INT8_4H"; + break; case kPackedInt8_4W4C: os << "TENSOR_PACKED_INT8_4W4C"; break; diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 03619ec54af..e33ce1280e8 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -271,7 +271,8 @@ TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { const size_t& ndim = std::get<0>(test_case); const int32_t packed_dim = std::get<1>(test_case); const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, packed_dim); + api::PackedDimInfo packed_dim_info(packed_dim, false, packed_dim, false); + std::vector dim_order = calculate_dim_order(ndim, packed_dim_info); ASSERT_TRUE(dim_order == expected_dim_order); } @@ -294,9 +295,14 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { { const int32_t packed_dim = static_cast(layout); + api::PackedDimInfo packed_dim_info( + packed_dim, false, packed_dim, false); std::vector dim_order = - calculate_dim_order(sizes.size(), packed_dim); - std::vector strides = calculate_strides(sizes, dim_order); + calculate_dim_order(sizes.size(), packed_dim_info); + std::vector padded_sizes = + calculate_padded_sizes(sizes, packed_dim_info); + std::vector strides = calculate_strides( + vkapi::kFloat, sizes.size(), padded_sizes, dim_order); int64_t numel = utils::multiply_integers(sizes); std::vector ref_strides = get_reference_strides(sizes, layout);