From 8aeb5f46da6fcaa41ca542a7a012bac2fdf839db Mon Sep 17 00:00:00 2001
From: ssjia
Date: Fri, 26 Dec 2025 17:40:01 -0800
Subject: [PATCH] [ET-VK][refactor] Introduce PackedDimInfo struct for packed dimension metadata

## Context

With the introduction of block-packed memory layouts for quantized tensors, the metadata stored by `vTensor` to describe the data layout within a texture/buffer was no longer sufficient to completely describe that layout. This created an awkward pattern where the `GPUMemoryLayout` had to be estimated in order to compute storage descriptors such as image extents.

This diff addresses the problem by introducing the `PackedDimInfo` struct to `vTensor`. It provides a complete description of how tensor data may be organized in the GPU buffer/texture used to store it, and allows the functions that compute the buffer numel or texture extents to be simplified.

## `PackedDimInfo`

Introduced the `PackedDimInfo` struct, which encapsulates all information about packed dimensions in GPU tensors. This improves code organization and makes the relationship between related metadata fields explicit.

The `PackedDimInfo` struct contains:

- `packed_dim`: which dimension is tightly packed (WHCN index), i.e. contiguous in memory
- `packed_dim_padded`: whether the packed dimension is padded to a multiple of 4; some layouts do this to accommodate vectorized loads/stores
- `outer_packed_dim`: second-level packing for block-packed layouts (4W4C, 4H4W); for layouts with only a single level of packing, this is equal to `packed_dim`
- `outer_packed_dim_padded`: whether the outer packed dim is padded (tiled layouts only)

## Changes

- Added the `PackedDimInfo` struct along with the helper function `calculate_packed_dim_info()`
- Replaced the `packed_dim_` member with `packed_dim_info_` in the `vTensor` class
- Updated function signatures to accept `PackedDimInfo&` instead of `packed_dim_`:
  * create_hashed_layout
  * calculate_dim_order
  * calculate_padded_sizes
  * calculate_logical_limits
  * TextureMetadata constructor/update
  * vTensorStorage constructor
- Added a `packed_dim_info()` accessor to the `vTensor` and `ComputeGraph` classes
- Store an additional `padded_sizes_` member in `vTensor`, which is now used for strides/image extents/GPU buffer numel computation instead of `sizes_` directly
- Introduce memory layouts for the kInt8x4 type that use only a single level of packing

Differential Revision: [D89832382](https://our.internmc.facebook.com/intern/diff/D89832382/)

[ghstack-poisoned]
---
 .../vulkan/runtime/api/containers/Tensor.cpp | 361 ++++++++++--------
 .../vulkan/runtime/api/containers/Tensor.h   |  86 +++--
 backends/vulkan/runtime/graph/ComputeGraph.h |   5 +
 .../vulkan/runtime/graph/ops/impl/Concat.cpp |  10 +-
 .../vulkan/runtime/graph/ops/impl/Tan.cpp    |   6 +-
 .../vulkan/runtime/utils/StorageUtils.cpp    |   3 +
 backends/vulkan/runtime/utils/StorageUtils.h |  29 ++
 .../vulkan/test/vulkan_compute_api_test.cpp  |  12 +-
 8 files changed, 323 insertions(+), 189 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 5a1c445889e..0c027f9c309 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -14,6 +14,38 @@ namespace vkcompute {
 namespace api {
 
+PackedDimInfo calculate_packed_dim_info(
+    const utils::GPUMemoryLayout memory_layout,
+    const utils::StorageType storage_type) {
+  const int32_t packed_dim = utils::to_packed_dim<int32_t>(memory_layout);
+
+  // Determine if packed dimension
is padded + const bool packed_dim_padded = storage_type != utils::kBuffer || + memory_layout == utils::kPackedInt8_4W || + memory_layout == utils::kPackedInt8_4C || + memory_layout == utils::kPackedInt8_4H || + memory_layout == utils::kPackedInt8_4W4C || + memory_layout == utils::kPackedInt8_4H4W; + + // Determine outer packed dimension (for tiled layouts) + int32_t outer_packed_dim; + if (memory_layout == utils::kPackedInt8_4W4C) { + outer_packed_dim = 0; // Width + } else if (memory_layout == utils::kPackedInt8_4H4W) { + outer_packed_dim = 1; // Height + } else { + outer_packed_dim = packed_dim; // No tiled packing + } + + // Determine if outer packed dimension is padded (only for tiled layouts) + const bool outer_packed_dim_padded = + memory_layout == utils::kPackedInt8_4W4C || + memory_layout == utils::kPackedInt8_4H4W; + + return PackedDimInfo( + packed_dim, packed_dim_padded, outer_packed_dim, outer_packed_dim_padded); +} + /* * For PackedInt8 memory layouts, ensure that the scalar type used for the * tensor is kInt8x4. Otherwise, return the original scalar type. @@ -35,24 +67,23 @@ vkapi::ScalarType get_effective_scalar_type( */ std::vector calculate_sizes( const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout) { + const PackedDimInfo& packed_dim_info) { auto sizes = std::vector{ image.extents().width, image.extents().height, image.extents().depth}; - const auto packed_dim = utils::to_packed_dim(memory_layout); - sizes.at(packed_dim) *= 4; + sizes.at(packed_dim_info.packed_dim) *= 4; return sizes; } std::vector calculate_dim_order( const size_t ndim, - const int32_t packed_dim) { + const PackedDimInfo& packed_dim_info) { // Special case for zero dim tensors if (ndim == 0) { return {0}; } std::vector dim_order(ndim); // Explicitly convert ndim to signed to prevent underflow - int64_t last_dim = int64_t(ndim) - 1 - packed_dim; + int64_t last_dim = int64_t(ndim) - 1 - packed_dim_info.packed_dim; int64_t cur_dim = 0; for (int d = 0; d < ndim; ++d) { @@ -70,23 +101,28 @@ std::vector calculate_dim_order( } std::vector calculate_strides( - const std::vector& sizes, + const vkapi::ScalarType dtype, + const size_t ndim, + const std::vector& padded_sizes, const std::vector& dim_order) { // For zero dim tensors - if (sizes.size() == 0) { + if (ndim == 0) { return {1}; } - size_t ndim = sizes.size(); std::vector strides(ndim); + // padded_sizes has align_up_4(ndim) dimensions, with padding at the start + // We need to offset when indexing into padded_sizes + const int64_t offset = padded_sizes.size() - ndim; + strides[dim_order[ndim - 1]] = 1; for (int32_t i = ndim - 2; i >= 0; --i) { - if (sizes[dim_order[i + 1]] == 0) { + if (padded_sizes[dim_order[i + 1] + offset] == 0) { strides[dim_order[i]] = strides[dim_order[i + 1]]; } else { strides[dim_order[i]] = - strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; + strides[dim_order[i + 1]] * padded_sizes[dim_order[i + 1] + offset]; } } @@ -179,7 +215,7 @@ utils::ivec4 flip_and_unsqueeze_ivec4( std::vector calculate_padded_sizes( const std::vector& sizes, - const int32_t packed_dim) { + const PackedDimInfo& packed_dim_info) { int64_t ndim = sizes.size(); if (ndim == 0) { ndim = 1; @@ -192,21 +228,47 @@ std::vector calculate_padded_sizes( padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); } - // Pad the packed dim to the next multiple of 4. 
- const int64_t dim_offset = packed_dim + 1; - const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); - padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + // Pad the packed dim to the next multiple of 4 if specified. + // This is required for texture storage and packed layouts. + if (packed_dim_info.packed_dim_padded) { + const int64_t dim_offset = packed_dim_info.packed_dim + 1; + const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); + padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + } + + // For tiled layouts (e.g., 4W4C, 4H4W), also pad the outer packed dimension + // if it's different from the inner packed dimension and is marked as padded. + if (packed_dim_info.outer_packed_dim != packed_dim_info.packed_dim && + packed_dim_info.outer_packed_dim_padded) { + const int64_t outer_dim_offset = packed_dim_info.outer_packed_dim + 1; + const int64_t outer_padded_dim_size = + utils::val_at(-outer_dim_offset, sizes); + padded_sizes.at(ndim_up4 - outer_dim_offset) = + utils::align_up_4(outer_padded_dim_size); + } return padded_sizes; } utils::uvec3 calculate_image_extents( + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, const std::vector& padded_sizes, - const utils::GPUMemoryLayout memory_layout, - const std::vector& axis_map, - const int32_t packed_dim) { + const std::vector& axis_map) { utils::uvec3 extents({1, 1, 1}); + const int64_t packed_dim_axis = axis_map.at(packed_dim_info.packed_dim); + const int64_t outer_packed_dim_axis = + axis_map.at(packed_dim_info.outer_packed_dim); + + // If the packed dim is not padded to the next multiple of 4, then that means + // this tensor is using buffer storage and does not require texture extents. + const int64_t packed_dim_idx = + padded_sizes.size() - 1 - packed_dim_info.packed_dim; + if (padded_sizes.at(packed_dim_idx) % 4 != 0) { + return extents; + } + // For high dimensional tensors, buffer storage must be used. No need to // compute image extents in this case. if (padded_sizes.size() > 4) { @@ -222,25 +284,26 @@ utils::uvec3 calculate_image_extents( } // For "regular" tensor dtypes, 4 elements along the packed dim are packed - // into one texel (4-component vectorized type). However, for packed int8 - // memory layouts, an additional level of packing is employed where 4 int8 - // elements are packed into one int32, and then 4 int32 are packed into each - // ivec4 texel. - if (utils::is_packed_int8_layout(memory_layout)) { - // Each int in the ivec4 contains 4 channels. The overall ivec4 contains - // data for a 1Hx4Wx4C block of the input tensor. - if (memory_layout == utils::kPackedInt8_4W4C) { - VK_CHECK_COND(packed_dim == 2); - extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u); + // into one texel (4-component vectorized type). However, for kInt8x4 dtype, + // an additional level of packing is employed where 4 int8 elements are + // packed into one int32, and then 4 int32 are packed into each ivec4 texel. + if (dtype == vkapi::kInt8x4) { + // For layouts with only one packed dimension, loading an ivec4 texel from + // the texture loads 16 int8 values (4 int32 that each contain 4 int8). + if (packed_dim_info.outer_packed_dim == packed_dim_info.packed_dim) { + extents[packed_dim_axis] = utils::div_up(extents[packed_dim_axis], 16u); } - // Each int in the ivec4 contains 4 elements along the width dim. The - // overall ivec4 contains data for a 4Hx4W block of the input tensor. 
- else if (memory_layout == utils::kPackedInt8_4H4W) { - VK_CHECK_COND(packed_dim == 0); - extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u); - } else { - VK_THROW("Unhandled packed int8 memory layout!"); + // Layouts with two packed dimension (e.g., 4W4C, 4H4W) load a 4x4 block of + // data from two dimensions with each ivec4 texel load, as opposed to 16 + // adjacent values from a single dimension. + else { + VK_CHECK_COND(extents[outer_packed_dim_axis] % 4 == 0); + extents[outer_packed_dim_axis] /= 4; + VK_CHECK_COND(extents[packed_dim_axis] % 4 == 0); + extents[packed_dim_axis] /= 4; } + } else { + extents[packed_dim_axis] /= 4; } // axis_map[3] indicates the WHCN index of the dimension used for batch @@ -251,9 +314,6 @@ utils::uvec3 calculate_image_extents( // Multiply the extents of the batch axis by the batch size. extents[batch_axis] *= padded_sizes.at(0); - VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); - extents[axis_map.at(packed_dim)] /= 4; - return extents; } @@ -285,73 +345,43 @@ utils::uvec3 calculate_logical_limits( * directly from tensor sizes. */ utils::uvec3 calculate_logical_limits( - const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const std::vector& axis_map, - const int32_t packed_dim) { + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, + const std::vector& axis_map) { return calculate_logical_limits( - calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - memory_layout, - axis_map, - packed_dim), + calculate_image_extents(dtype, packed_dim_info, padded_sizes, axis_map), axis_map); } +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. + */ int64_t calculate_gpu_buffer_numel( - const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const vkapi::ScalarType dtype) { + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes) { size_t numel; - // Mirrors the logic in calculate_image_extents for packed int8 memory layouts + numel = utils::multiply_integers(padded_sizes); + + // For this dtype, the data buffer is interpreted as an array of int32, where + // each int32 contains 4xint8 values. To account for this, the number of + // elements needs to be divided by 4. if (dtype == vkapi::kInt8x4) { - VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout)); - std::vector blocks_in_dim = - flip_and_unsqueeze(sizes, kTensorSizes, 0); - // Each ivec4 contains data for a 1Hx4Wx4C block of the input - if (memory_layout == utils::kPackedInt8_4W4C) { - blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); - blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]); - } - // Each ivec4 contains data for a 4Hx4W block of the input - else if (memory_layout == utils::kPackedInt8_4H4W) { - blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); - blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]); - } else { - VK_THROW("Unhandled packed int8 memory layout!"); - } - // Each block is represented as an ivec4, and the base dtype of the buffer - // is int. Therefore, need to multiply the number of blocks by 4 to obtain - // the number of int elements in the data buffer. - numel = utils::multiply_integers(blocks_in_dim) * 4; - } - // Case for "regular" dtypes/memory layouts - else { - numel = utils::multiply_integers(sizes); - - // For 8-bit types, align to the next multiple of 4. 
For devices that do not - // support 8-bit storage buffers, the tensor data will be interpreted as an - // array of int32 instead. - if (vkapi::element_size(dtype) == 1) { - numel = utils::align_up_4(numel); - } + // Should already be a multiple of 4 due to padding the packed dimensions + VK_CHECK_COND(numel % 4 == 0); + numel /= 4; } - return numel; -} -int64_t calculate_staging_or_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const vkapi::ScalarType dtype) { - // For texture backed tensors, simply multiply the total number of texels by 4 - if (storage_type != utils::kBuffer) { - return image_extents[0] * image_extents[1] * image_extents[2] * 4; + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. + if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); } - return calculate_gpu_buffer_numel(sizes, memory_layout, dtype); + return numel; } template ::value>> @@ -365,13 +395,13 @@ int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { int32_t create_hashed_layout( const std::vector& dim_order, const std::vector& axis_map, - const int32_t packed_dim, + const PackedDimInfo& packed_dim_info, const utils::StorageType storage_type) { if (storage_type == utils::kBuffer) { return pack_into_int32( flip_and_unsqueeze(dim_order, kTensorDimOrder, 0), 0); } - return pack_into_int32(axis_map, packed_dim); + return pack_into_int32(axis_map, packed_dim_info.packed_dim); } size_t calculate_max_ubo_nbytes( @@ -500,24 +530,19 @@ vTensorStorage::vTensorStorage( const utils::StorageType storage_type, const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, image_extents_(calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - memory_layout, - axis_map, - packed_dim)), - buffer_length_{calculate_staging_or_gpu_buffer_numel( - context_, - sizes, - image_extents_, - storage_type, - memory_layout, - dtype)}, + dtype, + packed_dim_info, + padded_sizes, + axis_map)), + buffer_length_{ + calculate_gpu_buffer_numel(dtype, packed_dim_info, padded_sizes)}, buffer_offset_{0}, image_(allocate_image( context_, @@ -634,18 +659,20 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory, const utils::AxisMapLayout axis_map_layout) - : dtype_(get_effective_scalar_type(dtype, memory_layout)), + : packed_dim_info_(calculate_packed_dim_info(memory_layout, storage_type)), + dtype_(get_effective_scalar_type(dtype, memory_layout)), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), - packed_dim_(utils::to_packed_dim(memory_layout)), - dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), + dim_order_(calculate_dim_order(sizes_.size(), packed_dim_info_)), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), - strides_(calculate_strides(sizes, dim_order_)), + strides_( + calculate_strides(dtype_, sizes.size(), padded_sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), hashed_layout_(create_hashed_layout( dim_order_, 
axis_map_, - packed_dim_, + packed_dim_info_, storage_type)), // Related to tensor metadata UBOs min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, @@ -659,8 +686,8 @@ vTensor::vTensor( storage_type, memory_layout, axis_map_, - packed_dim_, - sizes, + packed_dim_info_, + padded_sizes_, dtype_, allocate_memory)) { // uniform_data_ only valid for low dim tensors @@ -683,10 +710,12 @@ vTensor::vTensor( const vkapi::VulkanImage& image, const utils::GPUMemoryLayout memory_layout, const utils::AxisMapLayout axis_map_layout) - : dtype_(vkapi::element_scalartype(image.format())), + : packed_dim_info_( + calculate_packed_dim_info(memory_layout, utils::kTexture3D)), + dtype_(vkapi::element_scalartype(image.format())), // Calculate tensor metadata - sizes_(calculate_sizes(image, memory_layout)), - packed_dim_(utils::to_packed_dim(memory_layout)), + sizes_(calculate_sizes(image, packed_dim_info_)), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), @@ -694,7 +723,7 @@ vTensor::vTensor( hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, utils::kTexture3D)), // Related to tensor metadata UBOs min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, @@ -713,10 +742,11 @@ vTensor::vTensor( } vTensor::vTensor(vTensor& other) - : dtype_(other.dtype_), + : packed_dim_info_{other.packed_dim_info_}, + dtype_(other.dtype_), // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), - packed_dim_{other.packed_dim_}, + padded_sizes_(other.padded_sizes_.begin(), other.padded_sizes_.end()), dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), @@ -735,18 +765,20 @@ vTensor::vTensor( vTensor& other, const std::vector& sizes, const std::vector& dim_order) - : dtype_(other.dtype_), + : packed_dim_info_(other.packed_dim_info_), + dtype_(other.dtype_), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), - packed_dim_(other.packed_dim_), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), - strides_(calculate_strides(sizes_, dim_order_)), + strides_( + calculate_strides(dtype_, sizes_.size(), padded_sizes_, dim_order_)), numel_(other.numel_), hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, other.storage_type())), min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, max_ubo_nbytes_{other.max_ubo_nbytes_}, @@ -840,15 +872,15 @@ vTensor::TextureMetadata::TextureMetadata( const std::vector& src_sizes, const TextureLimits& src_logical_limits, const std::vector& src_axis_map, - const int32_t src_packed_dim) { - update(src_sizes, src_logical_limits, src_axis_map, src_packed_dim); + const PackedDimInfo& src_packed_dim_info) { + update(src_sizes, src_logical_limits, src_axis_map, src_packed_dim_info); } void vTensor::TextureMetadata::update( const std::vector& src_sizes, const TextureLimits& src_logical_limits, const std::vector& src_axis_map, - const int32_t src_packed_dim) { + const PackedDimInfo& src_packed_dim_info) { // Convert sizes to flipped and unsqueezed format (fixed to 4 dimensions for // texture) std::vector fu_sizes = @@ -877,7 +909,7 @@ void vTensor::TextureMetadata::update( axis_map[i] = 0; } - packed_dim = src_packed_dim; + 
packed_dim = src_packed_dim_info.packed_dim; } vkapi::VulkanImage& vTensor::image( @@ -911,17 +943,36 @@ vkapi::VulkanBuffer& vTensor::buffer( } utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + // Check for tiled layouts (two-level packing) - only applicable for kInt8x4 + if (dtype_ == vkapi::kInt8x4 && + packed_dim_info_.outer_packed_dim != packed_dim_info_.packed_dim) { + // For 4W4C: packed_dim = Channels, outer_packed_dim = Width + if (packed_dim_info_.packed_dim == WHCN::kChannelsDim && + packed_dim_info_.outer_packed_dim == WHCN::kWidthDim) { + return utils::kPackedInt8_4W4C; + } + // For 4H4W: packed_dim = Width, outer_packed_dim = Height + if (packed_dim_info_.packed_dim == WHCN::kWidthDim && + packed_dim_info_.outer_packed_dim == WHCN::kHeightDim) { + return utils::kPackedInt8_4H4W; + } + VK_THROW("Invalid tiled layout configuration for kInt8x4 dtype"); + } + + // Single-level packing layouts if (dtype_ == vkapi::kInt8x4) { - switch (packed_dim_) { + switch (packed_dim_info_.packed_dim) { case WHCN::kChannelsDim: - return utils::kPackedInt8_4W4C; + return utils::kPackedInt8_4C; case WHCN::kWidthDim: - return utils::kPackedInt8_4H4W; + return utils::kPackedInt8_4W; + case WHCN::kHeightDim: + return utils::kPackedInt8_4H; default: VK_THROW("Invalid packed dim for Tensor with kInt8x4 type"); } } - switch (packed_dim_) { + switch (packed_dim_info_.packed_dim) { case WHCN::kWidthDim: return utils::kWidthPacked; case WHCN::kHeightDim: @@ -996,7 +1047,7 @@ const vkapi::BufferBindInfo vTensor::texture_meta_ubo() { size_t ubo_nbytes = sizeof(TextureMetadata); if (!texture_meta_.buffer()) { TextureLimits limits(logical_limits()); - TextureMetadata data(sizes_, limits, axis_map_, packed_dim_); + TextureMetadata data(sizes_, limits, axis_map_, packed_dim_info_); texture_meta_ = ParamsBuffer(storage_->context_, data); } return vkapi::BufferBindInfo(texture_meta_.buffer(), 0, ubo_nbytes); @@ -1049,7 +1100,8 @@ void vTensor::acquire_allocation(vkapi::Allocation&& allocation) { void vTensor::update_metadata() { numel_ = utils::multiply_integers(sizes_); - strides_ = calculate_strides(sizes_, dim_order_); + strides_ = + calculate_strides(dtype_, sizes_.size(), padded_sizes_, dim_order_); // Update uniform data if it has been modified if (sizes_.size() <= 4) { @@ -1061,7 +1113,7 @@ void vTensor::update_metadata() { uniform_data_->strides_v = flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); uniform_data_->logical_limits.limits = calculate_logical_limits( - sizes_, estimate_memory_layout(), axis_map_, packed_dim_); + dtype_, packed_dim_info_, padded_sizes_, axis_map_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); @@ -1088,21 +1140,17 @@ void vTensor::update_metadata() { if (texture_meta_.buffer()) { TextureMetadata data( - sizes_, uniform_data_->logical_limits, axis_map_, packed_dim_); + sizes_, uniform_data_->logical_limits, axis_map_, packed_dim_info_); texture_meta_.update(data); } } void vTensor::check_sizes(const std::vector& sizes) const { - utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout(); if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), - est_memory_layout, - axis_map_, - packed_dim_); + dtype_, packed_dim_info_, padded_sizes_, axis_map_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -1117,7 +1165,7 @@ void vTensor::check_sizes(const std::vector& sizes) const { // For buffer storage check that the current buffer is large enough for // the new sizes of the tensor. int64_t numel = - calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_); + calculate_gpu_buffer_numel(dtype_, packed_dim_info_, padded_sizes_); bool valid_resize = numel + storage_->buffer_offset_ <= storage_->buffer_length_; VK_CHECK_COND( @@ -1137,11 +1185,12 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); dim_order_ = new_dim_order; // Update the hashed layout because dim order is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + hashed_layout_ = create_hashed_layout( + dim_order_, axis_map_, packed_dim_info_, storage_type()); update_metadata(); } @@ -1149,9 +1198,10 @@ void vTensor::virtual_reconfigure( void vTensor::virtual_clone(const vTensor& other) { VK_CHECK_COND(is_view_of(other)); sizes_ = other.sizes_; + padded_sizes_ = other.padded_sizes_; dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; - packed_dim_ = other.packed_dim_; + packed_dim_info_ = other.packed_dim_info_; hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); @@ -1164,6 +1214,7 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { check_sizes(new_sizes); sizes_ = new_sizes; + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); update_metadata(); } @@ -1191,10 +1242,10 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { const int dim0_whcn = sizes_.size() - 1 - dim0; const int dim1_whcn = sizes_.size() - 1 - dim1; - if (packed_dim_ == dim0_whcn) { - packed_dim_ = dim1_whcn; - } else if (packed_dim_ == dim1_whcn) { - packed_dim_ = dim0_whcn; + if (packed_dim_info_.packed_dim == dim0_whcn) { + packed_dim_info_.packed_dim = dim1_whcn; + } else if (packed_dim_info_.packed_dim == dim1_whcn) { + packed_dim_info_.packed_dim = dim0_whcn; } if (storage_type() == utils::kBuffer) { @@ -1213,8 +1264,12 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { } // Update the hashed layout because dim order / axis mpa is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + hashed_layout_ = create_hashed_layout( + dim_order_, axis_map_, packed_dim_info_, storage_type()); + + // Recalculate padded_sizes_ based on the new sizes and updated + // packed_dim_info + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 967148b8dbe..9ccff1b38b8 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -21,6 +21,43 @@ namespace api { static constexpr size_t kTensorDimLimit = 8; +/* + * PackedDimInfo encapsulates metadata about packed dimensions in GPU tensors. + * This includes information about which dimension is packed, whether it's + * padded, and tiled packing information for special layouts like 4W4C and 4H4W. 
+ */ +struct PackedDimInfo { + // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for + // width, 1 for height, etc.). For texture backed tensors, this describes + // which dimension is packed along a texel. For buffer backed tensors, this + // describes which dimension has a stride of 1 (i.e. is last in the dim + // order). + int32_t packed_dim; + // Describes if the packed dimension is padded to a multiple of 4. This will + // be true for all tensors that use texture storage, and will also be true + // for the PACKED_PADDED memory layouts. + bool packed_dim_padded; + // Describes a second level of packing, if applicable (which will only apply + // to the 4W4C and 4H4W layouts). If there is no second level of packing, + // then this will be equal to packed_dim. Otherwise, it will represent the + // outer dim used to construct tiled packing. For example, 4W4C will have + // packed_dim = 2 and outer_packed_dim = 0. + int32_t outer_packed_dim; + // Whether the outer packed dim is padded to the next multiple of 4. This is + // true only for tiled layouts. + bool outer_packed_dim_padded; + + PackedDimInfo( + int32_t dim, + bool dim_padded, + int32_t outer_dim, + bool outer_dim_padded) + : packed_dim(dim), + packed_dim_padded(dim_padded), + outer_packed_dim(outer_dim), + outer_packed_dim_padded(outer_dim_padded) {} +}; + /* * Given a GPUMemoryLayout value, produce a dim order vector that matches the * given memory layout. The produced dim order vector will be in the NCHW @@ -28,14 +65,16 @@ static constexpr size_t kTensorDimLimit = 8; */ std::vector calculate_dim_order( const size_t ndim, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); /* * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) * dimension order, calculate the strides of the tensor. */ std::vector calculate_strides( - const std::vector& sizes, + const vkapi::ScalarType dtype, + const size_t ndim, + const std::vector& padded_sizes, const std::vector& dim_order); /* @@ -55,15 +94,16 @@ std::vector calculate_strides( */ std::vector calculate_padded_sizes( const std::vector& sizes, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); /* * Calculate the image extents required of a texture backed tensor. */ utils::uvec3 calculate_image_extents( + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, const std::vector& padded_sizes, - const std::vector& axis_map, - const int32_t packed_dim); + const std::vector& axis_map); struct LastAccess { vkapi::PipelineStageFlags stage; @@ -79,18 +119,6 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; -/* - * Calculate the number of elements that a GPU buffer would require to store the - * contents of a tensor. This will depend on the storage type and dtype of the - * tensor, as well as the features available on the device. 
- */ -int64_t calculate_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype); - class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -101,8 +129,8 @@ class vTensorStorage final { const utils::StorageType storage_type, const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -295,13 +323,13 @@ class vTensor final { const std::vector& sizes, const TextureLimits& logical_limits, const std::vector& axis_map, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); void update( const std::vector& sizes, const TextureLimits& logical_limits, const std::vector& axis_map, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); }; private: @@ -310,16 +338,14 @@ class vTensor final { * to construct a tensor. */ + // Information about packed dimension padding and tiled packing + PackedDimInfo packed_dim_info_; // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; // sizes of the tensor in NCHW dimension order std::vector sizes_; - // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for - // width, 1 for height, etc.). For texture backed tensors, this describes - // which dimension is packed along a texel. For buffer backed tensors, this - // describes which dimension has a stride of 1 (i.e. is last in the dim - // order). - int32_t packed_dim_; + // padded sizes of the tensor (pre-computed to avoid recalculation) + std::vector padded_sizes_; /* * "Layout" metadata. These describe with further detail how tensor data is @@ -483,7 +509,11 @@ class vTensor final { utils::GPUMemoryLayout estimate_memory_layout() const; inline int32_t packed_dim() const { - return packed_dim_; + return packed_dim_info_.packed_dim; + } + + inline const PackedDimInfo& packed_dim_info() const { + return packed_dim_info_; } /* diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 18e97d7b516..a7a2a09f1fc 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -442,6 +442,11 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().packed_dim(); } + inline const api::PackedDimInfo& packed_dim_info_of( + const ValueRef idx) const { + return values_.at(idx).toConstTensor().packed_dim_info(); + } + inline int32_t concat_dim_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().concat_dim(); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp index 0a4acb6cef3..1bc256ef6ef 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp @@ -108,17 +108,19 @@ utils::uvec3 concat_pick_global_wg_size( // Calculate what the image extents would be of a tensor with the input // volume's sizes. This produces the number of texels that would need to be // written to. 
- const int32_t packed_dim = graph->packed_dim_of(out); + const api::PackedDimInfo& packed_dim_info = graph->packed_dim_info_of(out); std::vector inp_volume_texel_sizes = - api::calculate_padded_sizes(inp_volume_sizes, packed_dim); + api::calculate_padded_sizes(inp_volume_sizes, packed_dim_info); // If the concat_dim is the same as the packed dim, and the concat_offset for // this input batch is not a multiple of 4, then the data from an input texel // may be split up between two output texels. For example: // I0 , I1 , I2 , I2 // O0 , O1 , O2 , X | X , X , X , X // Therefore, 1 texel is added to the packed dim to account for this. - inp_volume_texel_sizes.at(3 - packed_dim) = - utils::div_up_4(inp_volume_texel_sizes.at(3 - packed_dim)) + 1; + inp_volume_texel_sizes.at(3 - packed_dim_info.packed_dim) = + utils::div_up_4( + inp_volume_texel_sizes.at(3 - packed_dim_info.packed_dim)) + + 1; const uint32_t inp_volume_texel_numel = utils::multiply_integers(inp_volume_texel_sizes); diff --git a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp index 687b3923354..223f082d6a6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp @@ -34,7 +34,11 @@ void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) { add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); vkapi::ParamsBindList ubos({}); - ubos.append({graph.logical_limits_ubo(out)}); + if (graph.storage_type_of(out) == utils::kBuffer) { + ubos.append({graph.numel_ubo(out)}); + } else { + ubos.append({graph.logical_limits_ubo(out)}); + } graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, diff --git a/backends/vulkan/runtime/utils/StorageUtils.cpp b/backends/vulkan/runtime/utils/StorageUtils.cpp index cfe3d9e159a..767c1294c39 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.cpp +++ b/backends/vulkan/runtime/utils/StorageUtils.cpp @@ -13,6 +13,9 @@ namespace utils { bool is_packed_int8_layout(const GPUMemoryLayout layout) { switch (layout) { + case kPackedInt8_4W: + case kPackedInt8_4C: + case kPackedInt8_4H: case kPackedInt8_4W4C: case kPackedInt8_4H4W: return true; diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index a269adccecb..6ae9fab768a 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -101,6 +101,11 @@ enum class GPUMemoryLayout : uint8_t { * 16 element block is loaded, rather than 4 elements along one dimension. 
*/ + // Single-dimension packed layouts (with padding) + TENSOR_PACKED_INT8_4W = 5u, + TENSOR_PACKED_INT8_4C = 6u, + TENSOR_PACKED_INT8_4H = 7u, + TENSOR_PACKED_INT8_4W4C = 3u, TENSOR_PACKED_INT8_4H4W = 4u, }; @@ -114,6 +119,15 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; +static constexpr GPUMemoryLayout kPackedInt8_4W = + GPUMemoryLayout::TENSOR_PACKED_INT8_4W; + +static constexpr GPUMemoryLayout kPackedInt8_4C = + GPUMemoryLayout::TENSOR_PACKED_INT8_4C; + +static constexpr GPUMemoryLayout kPackedInt8_4H = + GPUMemoryLayout::TENSOR_PACKED_INT8_4H; + static constexpr GPUMemoryLayout kPackedInt8_4W4C = GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C; @@ -129,6 +143,12 @@ T to_packed_dim(const GPUMemoryLayout layout) { return 1; case kChannelsPacked: return 2; + case kPackedInt8_4W: + return 0; + case kPackedInt8_4C: + return 2; + case kPackedInt8_4H: + return 1; case kPackedInt8_4W4C: return 2; case kPackedInt8_4H4W: @@ -170,6 +190,15 @@ inline std::ostream& operator<<( case kChannelsPacked: os << "TENSOR_CHANNELS_PACKED"; break; + case kPackedInt8_4W: + os << "TENSOR_PACKED_INT8_4W"; + break; + case kPackedInt8_4C: + os << "TENSOR_PACKED_INT8_4C"; + break; + case kPackedInt8_4H: + os << "TENSOR_PACKED_INT8_4H"; + break; case kPackedInt8_4W4C: os << "TENSOR_PACKED_INT8_4W4C"; break; diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 03619ec54af..e33ce1280e8 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -271,7 +271,8 @@ TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { const size_t& ndim = std::get<0>(test_case); const int32_t packed_dim = std::get<1>(test_case); const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, packed_dim); + api::PackedDimInfo packed_dim_info(packed_dim, false, packed_dim, false); + std::vector dim_order = calculate_dim_order(ndim, packed_dim_info); ASSERT_TRUE(dim_order == expected_dim_order); } @@ -294,9 +295,14 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { { const int32_t packed_dim = static_cast(layout); + api::PackedDimInfo packed_dim_info( + packed_dim, false, packed_dim, false); std::vector dim_order = - calculate_dim_order(sizes.size(), packed_dim); - std::vector strides = calculate_strides(sizes, dim_order); + calculate_dim_order(sizes.size(), packed_dim_info); + std::vector padded_sizes = + calculate_padded_sizes(sizes, packed_dim_info); + std::vector strides = calculate_strides( + vkapi::kFloat, sizes.size(), padded_sizes, dim_order); int64_t numel = utils::multiply_integers(sizes); std::vector ref_strides = get_reference_strides(sizes, layout);