diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 5a1c445889e..3c798866ba5 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -14,6 +14,54 @@ namespace vkcompute { namespace api { +PackedDimInfo::PackedDimInfo( + int32_t dim, + bool dim_padded, + int32_t outer_dim, + bool outer_dim_padded) + : packed_dim(dim), + packed_dim_padded(dim_padded), + outer_packed_dim(outer_dim), + outer_packed_dim_padded(outer_dim_padded), + is_block_packed(outer_dim != dim) { + if (!is_block_packed) { + VK_CHECK_COND(!outer_packed_dim_padded); + } +} + +PackedDimInfo calculate_packed_dim_info( + const utils::GPUMemoryLayout memory_layout, + const utils::StorageType storage_type) { + const int32_t packed_dim = utils::to_packed_dim(memory_layout); + + // Determine if packed dimension is padded + const bool packed_dim_padded = storage_type != utils::kBuffer || + memory_layout == utils::kPackedInt8_4W || + memory_layout == utils::kPackedInt8_4C || + memory_layout == utils::kPackedInt8_4H || + memory_layout == utils::kPackedInt8_4W4C || + memory_layout == utils::kPackedInt8_4H4W; + + // Determine outer packed dimension (for block-packed layouts) + int32_t outer_packed_dim; + if (memory_layout == utils::kPackedInt8_4W4C) { + outer_packed_dim = 0; // Width + } else if (memory_layout == utils::kPackedInt8_4H4W) { + outer_packed_dim = 1; // Height + } else { + outer_packed_dim = packed_dim; // No block packing + } + + // Determine if outer packed dimension is padded (only for block-packed + // layouts) + const bool outer_packed_dim_padded = + memory_layout == utils::kPackedInt8_4W4C || + memory_layout == utils::kPackedInt8_4H4W; + + return PackedDimInfo( + packed_dim, packed_dim_padded, outer_packed_dim, outer_packed_dim_padded); +} + /* * For PackedInt8 memory layouts, ensure that the scalar type used for the * tensor is kInt8x4. Otherwise, return the original scalar type. @@ -35,24 +83,28 @@ vkapi::ScalarType get_effective_scalar_type( */ std::vector calculate_sizes( const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout) { + const PackedDimInfo& packed_dim_info) { auto sizes = std::vector{ image.extents().width, image.extents().height, image.extents().depth}; - const auto packed_dim = utils::to_packed_dim(memory_layout); - sizes.at(packed_dim) *= 4; + sizes.at(packed_dim_info.packed_dim) *= 4; return sizes; } +/* + * Given a GPUMemoryLayout value, produce a dim order vector that matches the + * given memory layout. The produced dim order vector will be in the NCHW + * dimension order + */ std::vector calculate_dim_order( const size_t ndim, - const int32_t packed_dim) { + const PackedDimInfo& packed_dim_info) { // Special case for zero dim tensors if (ndim == 0) { return {0}; } std::vector dim_order(ndim); // Explicitly convert ndim to signed to prevent underflow - int64_t last_dim = int64_t(ndim) - 1 - packed_dim; + int64_t last_dim = int64_t(ndim) - 1 - packed_dim_info.packed_dim; int64_t cur_dim = 0; for (int d = 0; d < ndim; ++d) { @@ -69,24 +121,32 @@ std::vector calculate_dim_order( return dim_order; } +/* + * Given the sizes of a tensor and the dim order of the tensor (both in NCHW + * dimension order), calculate the strides of the tensor. 
+ */ std::vector calculate_strides( - const std::vector& sizes, + const size_t ndim, + const std::vector& padded_sizes, const std::vector& dim_order) { // For zero dim tensors - if (sizes.size() == 0) { + if (ndim == 0) { return {1}; } - size_t ndim = sizes.size(); std::vector strides(ndim); + // padded_sizes has align_up_4(ndim) dimensions, with padding at the start + // We need to offset when indexing into padded_sizes + const int64_t offset = padded_sizes.size() - ndim; + strides[dim_order[ndim - 1]] = 1; for (int32_t i = ndim - 2; i >= 0; --i) { - if (sizes[dim_order[i + 1]] == 0) { + if (padded_sizes[dim_order[i + 1] + offset] == 0) { strides[dim_order[i]] = strides[dim_order[i + 1]]; } else { strides[dim_order[i]] = - strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; + strides[dim_order[i + 1]] * padded_sizes[dim_order[i + 1] + offset]; } } @@ -177,9 +237,24 @@ utils::ivec4 flip_and_unsqueeze_ivec4( }; } +/* + * When stored on the GPU, tensor data may be stored using texels (i.e. a vector + * of 4 scalar values) in order to take advantage of the GPU's native + * vectorization capabilities. Furthermore, tensor metadata is passed in to + * shaders as ivec4 types. + * + * To accommodate these vectorized types, the sizes of a tensor will be modified + * for GPU storage in the following ways: + * + * 1. The dimensionality of the tensor will be padded to a multiple of 4. + * 2. The size of the packed dimension will be padded to a multiple of 4. + * + * The "packed dimension" is determined based on the utils::GPUMemoryLayout + * argument. + */ std::vector calculate_padded_sizes( const std::vector& sizes, - const int32_t packed_dim) { + const PackedDimInfo& packed_dim_info) { int64_t ndim = sizes.size(); if (ndim == 0) { ndim = 1; @@ -192,21 +267,51 @@ std::vector calculate_padded_sizes( padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); } - // Pad the packed dim to the next multiple of 4. - const int64_t dim_offset = packed_dim + 1; - const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); - padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + // Pad the packed dim to the next multiple of 4 if specified. + // This is required for texture storage and packed layouts. + if (packed_dim_info.packed_dim_padded) { + const int64_t dim_offset = packed_dim_info.packed_dim + 1; + const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); + padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + } + + // For block-packed layouts (e.g., 4W4C, 4H4W), also pad the outer packed + // dimension if it's different from the inner packed dimension and is marked + // as padded. + if (packed_dim_info.is_block_packed && + packed_dim_info.outer_packed_dim_padded) { + const int64_t outer_dim_offset = packed_dim_info.outer_packed_dim + 1; + const int64_t outer_padded_dim_size = + utils::val_at(-outer_dim_offset, sizes); + padded_sizes.at(ndim_up4 - outer_dim_offset) = + utils::align_up_4(outer_padded_dim_size); + } return padded_sizes; } +/* + * Calculate the image extents required of a texture backed tensor. 
+ */ utils::uvec3 calculate_image_extents( + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, const std::vector& padded_sizes, - const utils::GPUMemoryLayout memory_layout, - const std::vector& axis_map, - const int32_t packed_dim) { + const std::vector& axis_map) { utils::uvec3 extents({1, 1, 1}); + const int64_t packed_dim_axis = axis_map.at(packed_dim_info.packed_dim); + const int64_t outer_packed_dim_axis = + axis_map.at(packed_dim_info.outer_packed_dim); + + // If the packed dim is not padded to the next multiple of 4, then that means + // this tensor is using buffer storage and does not require texture extents. + const int64_t packed_dim_idx = + padded_sizes.size() - 1 - packed_dim_info.packed_dim; + if (padded_sizes.at(packed_dim_idx) % 4 != 0) { + return extents; + } + // For high dimensional tensors, buffer storage must be used. No need to // compute image extents in this case. if (padded_sizes.size() > 4) { @@ -222,25 +327,26 @@ utils::uvec3 calculate_image_extents( } // For "regular" tensor dtypes, 4 elements along the packed dim are packed - // into one texel (4-component vectorized type). However, for packed int8 - // memory layouts, an additional level of packing is employed where 4 int8 - // elements are packed into one int32, and then 4 int32 are packed into each - // ivec4 texel. - if (utils::is_packed_int8_layout(memory_layout)) { - // Each int in the ivec4 contains 4 channels. The overall ivec4 contains - // data for a 1Hx4Wx4C block of the input tensor. - if (memory_layout == utils::kPackedInt8_4W4C) { - VK_CHECK_COND(packed_dim == 2); - extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u); + // into one texel (4-component vectorized type). However, for kInt8x4 dtype, + // an additional level of packing is employed where 4 int8 elements are + // packed into one int32, and then 4 int32 are packed into each ivec4 texel. + if (dtype == vkapi::kInt8x4) { + // For layouts with only one packed dimension, loading an ivec4 texel from + // the texture loads 16 int8 values (4 int32 that each contain 4 int8). + if (!packed_dim_info.is_block_packed) { + extents[packed_dim_axis] = utils::div_up(extents[packed_dim_axis], 16u); } - // Each int in the ivec4 contains 4 elements along the width dim. The - // overall ivec4 contains data for a 4Hx4W block of the input tensor. - else if (memory_layout == utils::kPackedInt8_4H4W) { - VK_CHECK_COND(packed_dim == 0); - extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u); - } else { - VK_THROW("Unhandled packed int8 memory layout!"); + // Layouts with two packed dimension (e.g., 4W4C, 4H4W) load a 4x4 block of + // data from two dimensions with each ivec4 texel load, as opposed to 16 + // adjacent values from a single dimension. + else { + VK_CHECK_COND(extents[outer_packed_dim_axis] % 4 == 0); + extents[outer_packed_dim_axis] /= 4; + VK_CHECK_COND(extents[packed_dim_axis] % 4 == 0); + extents[packed_dim_axis] /= 4; } + } else { + extents[packed_dim_axis] /= 4; } // axis_map[3] indicates the WHCN index of the dimension used for batch @@ -251,9 +357,6 @@ utils::uvec3 calculate_image_extents( // Multiply the extents of the batch axis by the batch size. extents[batch_axis] *= padded_sizes.at(0); - VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); - extents[axis_map.at(packed_dim)] /= 4; - return extents; } @@ -285,73 +388,42 @@ utils::uvec3 calculate_logical_limits( * directly from tensor sizes. 
*/ utils::uvec3 calculate_logical_limits( - const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const std::vector& axis_map, - const int32_t packed_dim) { + const vkapi::ScalarType dtype, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, + const std::vector& axis_map) { return calculate_logical_limits( - calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - memory_layout, - axis_map, - packed_dim), + calculate_image_extents(dtype, packed_dim_info, padded_sizes, axis_map), axis_map); } +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. + */ int64_t calculate_gpu_buffer_numel( - const std::vector& sizes, - const utils::GPUMemoryLayout memory_layout, - const vkapi::ScalarType dtype) { + const vkapi::ScalarType dtype, + const std::vector& padded_sizes) { size_t numel; - // Mirrors the logic in calculate_image_extents for packed int8 memory layouts + numel = utils::multiply_integers(padded_sizes); + + // For this dtype, the data buffer is interpreted as an array of int32, where + // each int32 contains 4xint8 values. To account for this, the number of + // elements needs to be divided by 4. if (dtype == vkapi::kInt8x4) { - VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout)); - std::vector blocks_in_dim = - flip_and_unsqueeze(sizes, kTensorSizes, 0); - // Each ivec4 contains data for a 1Hx4Wx4C block of the input - if (memory_layout == utils::kPackedInt8_4W4C) { - blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); - blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]); - } - // Each ivec4 contains data for a 4Hx4W block of the input - else if (memory_layout == utils::kPackedInt8_4H4W) { - blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); - blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]); - } else { - VK_THROW("Unhandled packed int8 memory layout!"); - } - // Each block is represented as an ivec4, and the base dtype of the buffer - // is int. Therefore, need to multiply the number of blocks by 4 to obtain - // the number of int elements in the data buffer. - numel = utils::multiply_integers(blocks_in_dim) * 4; + // Should already be a multiple of 4 due to padding the packed dimensions + VK_CHECK_COND(numel % 4 == 0); + numel /= 4; } - // Case for "regular" dtypes/memory layouts - else { - numel = utils::multiply_integers(sizes); - - // For 8-bit types, align to the next multiple of 4. For devices that do not - // support 8-bit storage buffers, the tensor data will be interpreted as an - // array of int32 instead. - if (vkapi::element_size(dtype) == 1) { - numel = utils::align_up_4(numel); - } - } - return numel; -} -int64_t calculate_staging_or_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const vkapi::ScalarType dtype) { - // For texture backed tensors, simply multiply the total number of texels by 4 - if (storage_type != utils::kBuffer) { - return image_extents[0] * image_extents[1] * image_extents[2] * 4; + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. 
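+  // For example, a kChar tensor whose padded sizes multiply to 35 reports a
+  // buffer numel of 36 here, so the 36-byte buffer can equivalently be viewed
+  // as 9 int32 values on devices without 8-bit storage buffer support.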
+ if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); } - return calculate_gpu_buffer_numel(sizes, memory_layout, dtype); + return numel; } template ::value>> @@ -365,13 +437,13 @@ int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { int32_t create_hashed_layout( const std::vector& dim_order, const std::vector& axis_map, - const int32_t packed_dim, + const PackedDimInfo& packed_dim_info, const utils::StorageType storage_type) { if (storage_type == utils::kBuffer) { return pack_into_int32( flip_and_unsqueeze(dim_order, kTensorDimOrder, 0), 0); } - return pack_into_int32(axis_map, packed_dim); + return pack_into_int32(axis_map, packed_dim_info.packed_dim); } size_t calculate_max_ubo_nbytes( @@ -498,26 +570,20 @@ vkapi::VulkanBuffer allocate_buffer( vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, + const int64_t physical_numel, const bool allocate_memory) : context_(context), storage_type_{storage_type}, image_extents_(calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - memory_layout, - axis_map, - packed_dim)), - buffer_length_{calculate_staging_or_gpu_buffer_numel( - context_, - sizes, - image_extents_, - storage_type, - memory_layout, - dtype)}, + dtype, + packed_dim_info, + padded_sizes, + axis_map)), + buffer_length_{physical_numel}, buffer_offset_{0}, image_(allocate_image( context_, @@ -634,18 +700,20 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory, const utils::AxisMapLayout axis_map_layout) - : dtype_(get_effective_scalar_type(dtype, memory_layout)), + : packed_dim_info_(calculate_packed_dim_info(memory_layout, storage_type)), + dtype_(get_effective_scalar_type(dtype, memory_layout)), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), - packed_dim_(utils::to_packed_dim(memory_layout)), - dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), + dim_order_(calculate_dim_order(sizes_.size(), packed_dim_info_)), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), - strides_(calculate_strides(sizes, dim_order_)), + strides_(calculate_strides(sizes.size(), padded_sizes_, dim_order_)), numel_(utils::multiply_integers(sizes_)), + physical_numel_(calculate_gpu_buffer_numel(dtype_, padded_sizes_)), hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, storage_type)), // Related to tensor metadata UBOs min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, @@ -657,11 +725,11 @@ vTensor::vTensor( storage_(std::make_shared( context, storage_type, - memory_layout, axis_map_, - packed_dim_, - sizes, + packed_dim_info_, + padded_sizes_, dtype_, + physical_numel_, allocate_memory)) { // uniform_data_ only valid for low dim tensors if (sizes.size() <= 4) { @@ -683,18 +751,21 @@ vTensor::vTensor( const vkapi::VulkanImage& image, const utils::GPUMemoryLayout memory_layout, const utils::AxisMapLayout axis_map_layout) - : dtype_(vkapi::element_scalartype(image.format())), + : packed_dim_info_( + calculate_packed_dim_info(memory_layout, utils::kTexture3D)), + dtype_(vkapi::element_scalartype(image.format())), // Calculate tensor metadata - sizes_(calculate_sizes(image, 
memory_layout)), - packed_dim_(utils::to_packed_dim(memory_layout)), + sizes_(calculate_sizes(image, packed_dim_info_)), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), numel_(utils::multiply_integers(sizes_)), + physical_numel_(calculate_gpu_buffer_numel(dtype_, padded_sizes_)), hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, utils::kTexture3D)), // Related to tensor metadata UBOs min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, @@ -713,14 +784,16 @@ vTensor::vTensor( } vTensor::vTensor(vTensor& other) - : dtype_(other.dtype_), + : packed_dim_info_{other.packed_dim_info_}, + dtype_(other.dtype_), // Copy tensor size metadata sizes_(other.sizes_.begin(), other.sizes_.end()), - packed_dim_{other.packed_dim_}, + padded_sizes_(other.padded_sizes_.begin(), other.padded_sizes_.end()), dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), numel_(other.numel_), + physical_numel_(other.physical_numel_), hashed_layout_(other.hashed_layout_), min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, max_ubo_nbytes_{other.max_ubo_nbytes_}, @@ -735,18 +808,20 @@ vTensor::vTensor( vTensor& other, const std::vector& sizes, const std::vector& dim_order) - : dtype_(other.dtype_), + : packed_dim_info_(other.packed_dim_info_), + dtype_(other.dtype_), // Copy tensor size metadata sizes_(sizes.begin(), sizes.end()), - packed_dim_(other.packed_dim_), + padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_info_)), dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), - strides_(calculate_strides(sizes_, dim_order_)), - numel_(other.numel_), + strides_(calculate_strides(sizes_.size(), padded_sizes_, dim_order_)), + numel_(utils::multiply_integers(sizes_)), + physical_numel_(calculate_gpu_buffer_numel(dtype_, padded_sizes_)), hashed_layout_(create_hashed_layout( dim_order_, axis_map_, - packed_dim_, + packed_dim_info_, other.storage_type())), min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, max_ubo_nbytes_{other.max_ubo_nbytes_}, @@ -755,11 +830,7 @@ vTensor::vTensor( // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(UniformData{ - static_cast(utils::multiply_integers(sizes_)), - sizes_, - dim_order_, - strides_, - other.logical_limits()}); + numel_, sizes_, dim_order_, strides_, other.logical_limits()}); VK_CHECK_COND( dim_order_is_valid(dim_order_), "new dim order provided is invalid"); @@ -840,15 +911,15 @@ vTensor::TextureMetadata::TextureMetadata( const std::vector& src_sizes, const TextureLimits& src_logical_limits, const std::vector& src_axis_map, - const int32_t src_packed_dim) { - update(src_sizes, src_logical_limits, src_axis_map, src_packed_dim); + const PackedDimInfo& src_packed_dim_info) { + update(src_sizes, src_logical_limits, src_axis_map, src_packed_dim_info); } void vTensor::TextureMetadata::update( const std::vector& src_sizes, const TextureLimits& src_logical_limits, const std::vector& src_axis_map, - const int32_t src_packed_dim) { + const PackedDimInfo& src_packed_dim_info) { // Convert sizes to flipped and unsqueezed format (fixed to 4 dimensions for // texture) std::vector fu_sizes = @@ -877,7 +948,7 @@ void vTensor::TextureMetadata::update( axis_map[i] = 0; } - packed_dim = src_packed_dim; + packed_dim = 
src_packed_dim_info.packed_dim; } vkapi::VulkanImage& vTensor::image( @@ -911,17 +982,36 @@ vkapi::VulkanBuffer& vTensor::buffer( } utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + // Check for block-packed layouts (two-level packing) - only applicable for + // kInt8x4 + if (dtype_ == vkapi::kInt8x4 && packed_dim_info_.is_block_packed) { + // For 4W4C: packed_dim = Channels, outer_packed_dim = Width + if (packed_dim_info_.packed_dim == WHCN::kChannelsDim && + packed_dim_info_.outer_packed_dim == WHCN::kWidthDim) { + return utils::kPackedInt8_4W4C; + } + // For 4H4W: packed_dim = Width, outer_packed_dim = Height + if (packed_dim_info_.packed_dim == WHCN::kWidthDim && + packed_dim_info_.outer_packed_dim == WHCN::kHeightDim) { + return utils::kPackedInt8_4H4W; + } + VK_THROW("Invalid block-packed layout configuration for kInt8x4 dtype"); + } + + // Single-level packing layouts if (dtype_ == vkapi::kInt8x4) { - switch (packed_dim_) { + switch (packed_dim_info_.packed_dim) { case WHCN::kChannelsDim: - return utils::kPackedInt8_4W4C; + return utils::kPackedInt8_4C; case WHCN::kWidthDim: - return utils::kPackedInt8_4H4W; + return utils::kPackedInt8_4W; + case WHCN::kHeightDim: + return utils::kPackedInt8_4H; default: VK_THROW("Invalid packed dim for Tensor with kInt8x4 type"); } } - switch (packed_dim_) { + switch (packed_dim_info_.packed_dim) { case WHCN::kWidthDim: return utils::kWidthPacked; case WHCN::kHeightDim: @@ -996,7 +1086,7 @@ const vkapi::BufferBindInfo vTensor::texture_meta_ubo() { size_t ubo_nbytes = sizeof(TextureMetadata); if (!texture_meta_.buffer()) { TextureLimits limits(logical_limits()); - TextureMetadata data(sizes_, limits, axis_map_, packed_dim_); + TextureMetadata data(sizes_, limits, axis_map_, packed_dim_info_); texture_meta_ = ParamsBuffer(storage_->context_, data); } return vkapi::BufferBindInfo(texture_meta_.buffer(), 0, ubo_nbytes); @@ -1049,7 +1139,8 @@ void vTensor::acquire_allocation(vkapi::Allocation&& allocation) { void vTensor::update_metadata() { numel_ = utils::multiply_integers(sizes_); - strides_ = calculate_strides(sizes_, dim_order_); + physical_numel_ = calculate_gpu_buffer_numel(dtype_, padded_sizes_); + strides_ = calculate_strides(sizes_.size(), padded_sizes_, dim_order_); // Update uniform data if it has been modified if (sizes_.size() <= 4) { @@ -1061,7 +1152,7 @@ void vTensor::update_metadata() { uniform_data_->strides_v = flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); uniform_data_->logical_limits.limits = calculate_logical_limits( - sizes_, estimate_memory_layout(), axis_map_, packed_dim_); + dtype_, packed_dim_info_, padded_sizes_, axis_map_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); @@ -1088,21 +1179,17 @@ void vTensor::update_metadata() { if (texture_meta_.buffer()) { TextureMetadata data( - sizes_, uniform_data_->logical_limits, axis_map_, packed_dim_); + sizes_, uniform_data_->logical_limits, axis_map_, packed_dim_info_); texture_meta_.update(data); } } void vTensor::check_sizes(const std::vector& sizes) const { - utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout(); if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), - est_memory_layout, - axis_map_, - packed_dim_); + dtype_, packed_dim_info_, padded_sizes_, axis_map_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -1116,10 +1203,10 @@ void vTensor::check_sizes(const std::vector& sizes) const { } else { // For buffer storage check that the current buffer is large enough for // the new sizes of the tensor. - int64_t numel = - calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_); + int64_t gpu_buffer_numel = + calculate_gpu_buffer_numel(dtype_, padded_sizes_); bool valid_resize = - numel + storage_->buffer_offset_ <= storage_->buffer_length_; + gpu_buffer_numel + storage_->buffer_offset_ <= storage_->buffer_length_; VK_CHECK_COND( valid_resize, "tensor sizes requires a larger buffer than the current one."); @@ -1137,11 +1224,12 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); dim_order_ = new_dim_order; // Update the hashed layout because dim order is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + hashed_layout_ = create_hashed_layout( + dim_order_, axis_map_, packed_dim_info_, storage_type()); update_metadata(); } @@ -1149,9 +1237,10 @@ void vTensor::virtual_reconfigure( void vTensor::virtual_clone(const vTensor& other) { VK_CHECK_COND(is_view_of(other)); sizes_ = other.sizes_; + padded_sizes_ = other.padded_sizes_; dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; - packed_dim_ = other.packed_dim_; + packed_dim_info_ = other.packed_dim_info_; hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); @@ -1164,6 +1253,7 @@ void vTensor::virtual_resize(const std::vector& new_sizes) { check_sizes(new_sizes); sizes_ = new_sizes; + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); update_metadata(); } @@ -1187,14 +1277,34 @@ void transpose_dim_order_inplace( } void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { - std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); - const int dim0_whcn = sizes_.size() - 1 - dim0; const int dim1_whcn = sizes_.size() - 1 - dim1; - if (packed_dim_ == dim0_whcn) { - packed_dim_ = dim1_whcn; - } else if (packed_dim_ == dim1_whcn) { - packed_dim_ = dim0_whcn; + + // For block-packed layouts, do not allow transposition if either packed_dim + // or outer_packed_dim is one of the dims being transposed + if (packed_dim_info_.is_block_packed) { + VK_CHECK_COND( + packed_dim_info_.packed_dim != dim0_whcn && + packed_dim_info_.packed_dim != dim1_whcn); + VK_CHECK_COND( + packed_dim_info_.outer_packed_dim != dim0_whcn && + packed_dim_info_.outer_packed_dim != dim1_whcn); + } + + std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); + + // Update packed_dim and outer_packed_dim if they match one of the transposed + // dims + if (packed_dim_info_.packed_dim == dim0_whcn) { + packed_dim_info_.packed_dim = dim1_whcn; + } else if (packed_dim_info_.packed_dim == dim1_whcn) { + packed_dim_info_.packed_dim = dim0_whcn; + } + + if (packed_dim_info_.outer_packed_dim == dim0_whcn) { + packed_dim_info_.outer_packed_dim = dim1_whcn; + } else if (packed_dim_info_.outer_packed_dim == dim1_whcn) { + packed_dim_info_.outer_packed_dim = dim0_whcn; } if (storage_type() == utils::kBuffer) { @@ -1212,9 +1322,13 @@ void vTensor::virtual_transpose(const int64_t 
dim0, const int64_t dim1) { } } - // Update the hashed layout because dim order / axis mpa is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + // Update the hashed layout because dim order / axis map is updated + hashed_layout_ = create_hashed_layout( + dim_order_, axis_map_, packed_dim_info_, storage_type()); + + // Recalculate padded_sizes_ based on the new sizes and updated + // packed_dim_info + padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_info_); update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 967148b8dbe..b3c7184d2b3 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -22,48 +22,40 @@ namespace api { static constexpr size_t kTensorDimLimit = 8; /* - * Given a GPUMemoryLayout value, produce a dim order vector that matches the - * given memory layout. The produced dim order vector will be in the NCHW - * dimension order + * PackedDimInfo encapsulates metadata about packed dimensions in GPU tensors. + * This includes information about which dimension is packed, whether it's + * padded, and block packing information for special layouts like 4W4C and 4H4W. */ -std::vector calculate_dim_order( - const size_t ndim, - const int32_t packed_dim); - -/* - * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) - * dimension order, calculate the strides of the tensor. - */ -std::vector calculate_strides( - const std::vector& sizes, - const std::vector& dim_order); - -/* - * When stored on the GPU, tensor data is stored using texels (i.e. a vector of - * 4 scalar values) in order to take advantage of the GPU's native vectorization - * capabilities. Furthermore, tensor metadata is passed in to shaders as ivec4 - * types. - * - * To accommodate these vectorized types, the sizes of a tensor will be modified - * for GPU storage in the following ways: - * - * 1. The dimensionality of the tensor will be padded to a multiple of 4. - * 2. The size of the packed dimension will be padded to a multiple of 4. - * - * The "packed dimension" is determined based on the utils::GPUMemoryLayout - * argument. - */ -std::vector calculate_padded_sizes( - const std::vector& sizes, - const int32_t packed_dim); - -/* - * Calculate the image extents required of a texture backed tensor. - */ -utils::uvec3 calculate_image_extents( - const std::vector& padded_sizes, - const std::vector& axis_map, - const int32_t packed_dim); +struct PackedDimInfo { + // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for + // width, 1 for height, etc.). For texture backed tensors, this describes + // which dimension is packed along a texel. For buffer backed tensors, this + // describes which dimension has a stride of 1 (i.e. is last in the dim + // order). + int32_t packed_dim; + // Describes if the packed dimension is padded to a multiple of 4. This will + // be true for all tensors that use texture storage, and will also be true + // for the PACKED_PADDED memory layouts. + bool packed_dim_padded; + // Describes a second level of packing, if applicable (which will only apply + // to the 4W4C and 4H4W layouts). If there is no second level of packing, + // then this will be equal to packed_dim. Otherwise, it will represent the + // outer dim used to construct block packing. For example, 4W4C will have + // packed_dim = 2 and outer_packed_dim = 0. 
+ int32_t outer_packed_dim; + // Whether the outer packed dim is padded to the next multiple of 4. This is + // true only for block-packed layouts. + bool outer_packed_dim_padded; + // True if this layout uses block packing (i.e., outer_packed_dim != + // packed_dim). Block packing is used for layouts like 4W4C and 4H4W. + bool is_block_packed; + + PackedDimInfo( + int32_t dim, + bool dim_padded, + int32_t outer_dim, + bool outer_dim_padded); +}; struct LastAccess { vkapi::PipelineStageFlags stage; @@ -79,18 +71,6 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; -/* - * Calculate the number of elements that a GPU buffer would require to store the - * contents of a tensor. This will depend on the storage type and dtype of the - * tensor, as well as the features available on the device. - */ -int64_t calculate_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype); - class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -99,11 +79,11 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, + const PackedDimInfo& packed_dim_info, + const std::vector& padded_sizes, const vkapi::ScalarType dtype, + const int64_t physical_numel, const bool allocate_memory = true); vTensorStorage(Context* const context, const vkapi::VulkanImage& image); @@ -295,13 +275,13 @@ class vTensor final { const std::vector& sizes, const TextureLimits& logical_limits, const std::vector& axis_map, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); void update( const std::vector& sizes, const TextureLimits& logical_limits, const std::vector& axis_map, - const int32_t packed_dim); + const PackedDimInfo& packed_dim_info); }; private: @@ -310,16 +290,14 @@ class vTensor final { * to construct a tensor. */ + // Information about packed dimension padding and block packing + PackedDimInfo packed_dim_info_; // Whether the tensor has elements of type float, int, etc. vkapi::ScalarType dtype_; // sizes of the tensor in NCHW dimension order std::vector sizes_; - // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for - // width, 1 for height, etc.). For texture backed tensors, this describes - // which dimension is packed along a texel. For buffer backed tensors, this - // describes which dimension has a stride of 1 (i.e. is last in the dim - // order). - int32_t packed_dim_; + // padded sizes of the tensor (pre-computed to avoid recalculation) + std::vector padded_sizes_; /* * "Layout" metadata. These describe with further detail how tensor data is @@ -353,6 +331,10 @@ class vTensor final { // number of elements based on the canonical sizes size_t numel_; + // number of elements required for GPU buffer storage (with padding/packing) + // This is pre-computed to avoid recomputing calculate_gpu_buffer_numel + int64_t physical_numel_; + // For texture backed tensors, this int32 contains the axis map data packed // into a single int32. For buffer backed tensors, this int32 contains the // wchn dim order data packed into a single int32. 
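A minimal usage sketch of how these new members relate for a block-packed layout; the construction mirrors the test helpers later in this diff, and the values match the golden test added below:

// Sketch only: a 4H4W block-packed int8 tensor of sizes {9, 13, 17}.
vTensor t(
    context(),
    {9, 13, 17},
    vkapi::kChar, // stored as kInt8x4
    utils::kBuffer,
    utils::kPackedInt8_4H4W,
    /*allocate_memory = */ false);
const auto& info = t.packed_dim_info();
// info.packed_dim == 0 (width), info.outer_packed_dim == 1 (height),
// info.is_block_packed == true, and both packed dims are padded, so
// t.padded_sizes() == {1, 9, 16, 20} and
// t.physical_numel() == (9 * 16 * 20) / 4 == 720 kInt8x4 elements.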
@@ -483,7 +465,11 @@ class vTensor final { utils::GPUMemoryLayout estimate_memory_layout() const; inline int32_t packed_dim() const { - return packed_dim_; + return packed_dim_info_.packed_dim; + } + + inline const PackedDimInfo& packed_dim_info() const { + return packed_dim_info_; } /* @@ -514,10 +500,22 @@ class vTensor final { return strides_; } + inline const std::vector& padded_sizes() const { + return padded_sizes_; + } + inline size_t numel() const { return numel_; } + inline int64_t physical_numel() const { + return physical_numel_; + } + + inline utils::uvec3 image_extents() const { + return storage_->image_extents_; + } + inline size_t nbytes() const { return element_size(dtype()) * numel(); } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 18e97d7b516..5b0e66030c8 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -362,6 +362,10 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().staging_buffer_numel(); } + inline int64_t physical_numel_of(const ValueRef idx) const { + return values_.at(idx).toConstTensor().physical_numel(); + } + inline utils::StorageType storage_type_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().storage_type(); } @@ -442,6 +446,11 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().packed_dim(); } + inline const api::PackedDimInfo& packed_dim_info_of( + const ValueRef idx) const { + return values_.at(idx).toConstTensor().packed_dim_info(); + } + inline int32_t concat_dim_of(const ValueRef idx) const { return values_.at(idx).toConstTensor().concat_dim(); } diff --git a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp index 0a4acb6cef3..1923757afbd 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp @@ -108,17 +108,19 @@ utils::uvec3 concat_pick_global_wg_size( // Calculate what the image extents would be of a tensor with the input // volume's sizes. This produces the number of texels that would need to be // written to. - const int32_t packed_dim = graph->packed_dim_of(out); + + const int32_t packed_dim_idx = graph->packed_dim_of(out); std::vector inp_volume_texel_sizes = - api::calculate_padded_sizes(inp_volume_sizes, packed_dim); + api::flip_and_unsqueeze(inp_volume_sizes, api::kTensorSizes, 1); + // If the concat_dim is the same as the packed dim, and the concat_offset for // this input batch is not a multiple of 4, then the data from an input texel // may be split up between two output texels. For example: // I0 , I1 , I2 , I2 // O0 , O1 , O2 , X | X , X , X , X // Therefore, 1 texel is added to the packed dim to account for this. 
- inp_volume_texel_sizes.at(3 - packed_dim) = - utils::div_up_4(inp_volume_texel_sizes.at(3 - packed_dim)) + 1; + inp_volume_texel_sizes.at(packed_dim_idx) = + utils::div_up_4(inp_volume_texel_sizes.at(packed_dim_idx)) + 1; const uint32_t inp_volume_texel_numel = utils::multiply_integers(inp_volume_texel_sizes); @@ -324,7 +326,7 @@ void add_concat_node( {1u, 1u, 1u}, {1u, 1u, 1u}, // Inputs and Outputs - {{concat_offset, vkapi::kWrite}}, + {{concat_offset, vkapi::kReadWrite}}, // Parameter buffers param_buffers, // Push Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp index 687b3923354..223f082d6a6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp @@ -34,7 +34,11 @@ void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) { add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); vkapi::ParamsBindList ubos({}); - ubos.append({graph.logical_limits_ubo(out)}); + if (graph.storage_type_of(out) == utils::kBuffer) { + ubos.append({graph.numel_ubo(out)}); + } else { + ubos.append({graph.logical_limits_ubo(out)}); + } graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, diff --git a/backends/vulkan/runtime/utils/StorageUtils.cpp b/backends/vulkan/runtime/utils/StorageUtils.cpp index cfe3d9e159a..767c1294c39 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.cpp +++ b/backends/vulkan/runtime/utils/StorageUtils.cpp @@ -13,6 +13,9 @@ namespace utils { bool is_packed_int8_layout(const GPUMemoryLayout layout) { switch (layout) { + case kPackedInt8_4W: + case kPackedInt8_4C: + case kPackedInt8_4H: case kPackedInt8_4W4C: case kPackedInt8_4H4W: return true; diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index a269adccecb..45b1529f5b0 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -101,6 +101,14 @@ enum class GPUMemoryLayout : uint8_t { * 16 element block is loaded, rather than 4 elements along one dimension. 
*/ + // "vector" packed layouts - single level of packing (4 elements along packed + // dim per int32) + TENSOR_PACKED_INT8_4W = 5u, + TENSOR_PACKED_INT8_4C = 6u, + TENSOR_PACKED_INT8_4H = 7u, + + // Block packed layouts - two levels of packing (4x4 block composed of + // elements from two packed dims per ivec4) TENSOR_PACKED_INT8_4W4C = 3u, TENSOR_PACKED_INT8_4H4W = 4u, }; @@ -114,6 +122,15 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; +static constexpr GPUMemoryLayout kPackedInt8_4W = + GPUMemoryLayout::TENSOR_PACKED_INT8_4W; + +static constexpr GPUMemoryLayout kPackedInt8_4C = + GPUMemoryLayout::TENSOR_PACKED_INT8_4C; + +static constexpr GPUMemoryLayout kPackedInt8_4H = + GPUMemoryLayout::TENSOR_PACKED_INT8_4H; + static constexpr GPUMemoryLayout kPackedInt8_4W4C = GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C; @@ -129,6 +146,12 @@ T to_packed_dim(const GPUMemoryLayout layout) { return 1; case kChannelsPacked: return 2; + case kPackedInt8_4W: + return 0; + case kPackedInt8_4C: + return 2; + case kPackedInt8_4H: + return 1; case kPackedInt8_4W4C: return 2; case kPackedInt8_4H4W: @@ -170,6 +193,15 @@ inline std::ostream& operator<<( case kChannelsPacked: os << "TENSOR_CHANNELS_PACKED"; break; + case kPackedInt8_4W: + os << "TENSOR_PACKED_INT8_4W"; + break; + case kPackedInt8_4C: + os << "TENSOR_PACKED_INT8_4C"; + break; + case kPackedInt8_4H: + os << "TENSOR_PACKED_INT8_4H"; + break; case kPackedInt8_4W4C: os << "TENSOR_PACKED_INT8_4W4C"; break; diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 03619ec54af..024c3a086a8 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -112,6 +112,70 @@ TEST_F(VulkanComputeAPITest, print_shader_executable_properties) { #endif // VK_KHR_pipeline_executable_properties && ETVK_INSPECT_PIPELINES +std::vector get_reference_dim_order( + const size_t ndim, + const int32_t packed_dim) { + // Special case for zero dim tensors + if (ndim == 0) { + return {0}; + } + std::vector dim_order(ndim); + // Explicitly convert ndim to signed to prevent underflow + int64_t last_dim = int64_t(ndim) - 1 - packed_dim; + + int64_t cur_dim = 0; + for (int d = 0; d < ndim; ++d) { + if (d == last_dim) { + cur_dim++; + } + dim_order[d] = cur_dim; + cur_dim++; + } + if (last_dim >= 0) { + dim_order[ndim - 1] = last_dim; + } + + return dim_order; +} + +std::vector get_reference_padded_sizes( + const std::vector& sizes, + const int32_t packed_dim, + const bool packed_dim_padded, + const int32_t outer_packed_dim = -1, + const bool outer_packed_dim_padded = false) { + int64_t ndim = sizes.size(); + if (ndim == 0) { + ndim = 1; + } + + // Tensor sizes will be unsqueezed up to the next multiple of 4 + const int64_t ndim_up4 = utils::align_up_4(ndim); + std::vector padded_sizes(ndim_up4); + for (int64_t i = 0; i < ndim_up4; ++i) { + padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); + } + + // Pad the packed dim to the next multiple of 4 if specified + if (packed_dim_padded) { + const int64_t dim_offset = packed_dim + 1; + const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); + padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); + } + + // For block-packed layouts, also pad the outer packed dimension if specified + if (outer_packed_dim >= 0 && outer_packed_dim != packed_dim && + outer_packed_dim_padded) { + const int64_t 
outer_dim_offset = outer_packed_dim + 1; + const int64_t outer_padded_dim_size = + utils::val_at(-outer_dim_offset, sizes); + padded_sizes.at(ndim_up4 - outer_dim_offset) = + utils::align_up_4(outer_padded_dim_size); + } + + return padded_sizes; +} + std::vector get_reference_strides( const std::vector& sizes, const utils::GPUMemoryLayout layout, @@ -194,39 +258,100 @@ std::vector get_reference_strides( return {}; } -/* - * Applies the following transformations to a tensor's dim_order vector: - * 1. Reverse the order of elements so that the fastest moving dimensions are - * first. - * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the - * width dimension, 1 represents the height dimension, and 2 represents the - * channels dimension. - * 3. Unsqueeze the dim_order vector to the next multiple of 4. - */ -std::vector create_whcn_dim_order( - const std::vector& dim_order) { - size_t ndim = dim_order.size(); - std::vector whcn_order(ndim); +int64_t get_reference_physical_numel( + const vkapi::ScalarType dtype, + const std::vector& padded_sizes) { + size_t numel = utils::multiply_integers(padded_sizes); - // Convert from NCHW to WHCN index, and flip the dim order so that the fastest - // moving dimension is first. - // example: { 1, 2, 0} -> { 2, 0, 1} - // {height, width, channels} -> {channels, width, height} - for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim; - ++whcn_i, --nchw_i) { - whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i); + // For kInt8x4, the data buffer is interpreted as an array of int32, where + // each int32 contains 4xint8 values. To account for this, the number of + // elements needs to be divided by 4. + if (dtype == vkapi::kInt8x4) { + // Should already be a multiple of 4 due to padding + if (numel % 4 != 0) { + VK_THROW("Expected numel to be multiple of 4 for kInt8x4"); + } + numel /= 4; } - // Unsqueeze to the next multiple of 4 - size_t ndim_up4 = utils::align_up_4(ndim); - whcn_order.resize(ndim_up4); + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. + if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); + } + return numel; +} + +utils::uvec3 get_reference_image_extents( + const vkapi::ScalarType dtype, + const int32_t packed_dim, + const int32_t outer_packed_dim, + const bool is_block_packed, + const std::vector& padded_sizes, + const std::vector& axis_map) { + utils::uvec3 extents({1, 1, 1}); - // Append unsqueezed dimensions - for (size_t i = ndim; i < ndim_up4; ++i) { - whcn_order.at(i) = i; + const int64_t packed_dim_axis = axis_map.at(packed_dim); + const int64_t outer_packed_dim_axis = axis_map.at(outer_packed_dim); + + // If the packed dim is not padded to the next multiple of 4, then that means + // this tensor is using buffer storage and does not require texture extents. + const int64_t packed_dim_idx = padded_sizes.size() - 1 - packed_dim; + if (padded_sizes.at(packed_dim_idx) % 4 != 0) { + return extents; + } + + // For high dimensional tensors, buffer storage must be used. No need to + // compute image extents in this case. + if (padded_sizes.size() > 4) { + return extents; + } + + // First three elements of axis_map indicate which (X,Y,Z) image axis the + // width, height, and channels dim of the tensor maps to. 
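+  // With the default axis map {0, 1, 2, 2}, for example, the width dim maps
+  // to the X axis of the image, height to Y, and channels to Z.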
+ for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { + const int64_t axis = axis_map.at(whcn_dim); + const int64_t dim = padded_sizes.size() - 1 - whcn_dim; + extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); + } + + // For "regular" tensor dtypes, 4 elements along the packed dim are packed + // into one texel (4-component vectorized type). However, for kInt8x4 dtype, + // an additional level of packing is employed where 4 int8 elements are + // packed into one int32, and then 4 int32 are packed into each ivec4 texel. + if (dtype == vkapi::kInt8x4) { + // For layouts with only one packed dimension, loading an ivec4 texel from + // the texture loads 16 int8 values (4 int32 that each contain 4 int8). + if (!is_block_packed) { + extents[packed_dim_axis] = utils::div_up(extents[packed_dim_axis], 16u); + } + // Layouts with two packed dimension (e.g., 4W4C, 4H4W) load a 4x4 block of + // data from two dimensions with each ivec4 texel load, as opposed to 16 + // adjacent values from a single dimension. + else { + if (extents[outer_packed_dim_axis] % 4 != 0) { + VK_THROW("Expected outer_packed_dim_axis extent to be multiple of 4"); + } + extents[outer_packed_dim_axis] /= 4; + if (extents[packed_dim_axis] % 4 != 0) { + VK_THROW("Expected packed_dim_axis extent to be multiple of 4"); + } + extents[packed_dim_axis] /= 4; + } + } else { + extents[packed_dim_axis] /= 4; } - return whcn_order; + // axis_map[3] indicates the WHCN index of the dimension used for batch + // concatenation. Thus a double lookup is required to determine the image axis + // used for batch concatenation. + const int64_t concatted_whcn_dim = axis_map.at(3); + const int64_t batch_axis = axis_map.at(concatted_whcn_dim); + // Multiply the extents of the batch axis by the batch size. 
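+  // With the default axis map {0, 1, 2, 2}, axis_map[3] == 2, so batches are
+  // concatenated along the same image axis (Z) that the channels dim maps to.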
+ extents[batch_axis] *= padded_sizes.at(0); + + return extents; } TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { @@ -250,89 +375,405 @@ bool compare_vectors( return true; } -TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { - // ndim, GPUMemoryLayout, expected dim order pairs - std::vector>> test_cases = { - {1, WHCN::kWidthDim, {0}}, - {1, WHCN::kHeightDim, {0}}, - {1, WHCN::kChannelsDim, {0}}, - {2, WHCN::kWidthDim, {0, 1}}, - {2, WHCN::kHeightDim, {1, 0}}, - {2, WHCN::kChannelsDim, {0, 1}}, - {3, WHCN::kWidthDim, {0, 1, 2}}, - {3, WHCN::kHeightDim, {0, 2, 1}}, - {3, WHCN::kChannelsDim, {1, 2, 0}}, - {4, WHCN::kWidthDim, {0, 1, 2, 3}}, - {4, WHCN::kHeightDim, {0, 1, 3, 2}}, - {4, WHCN::kChannelsDim, {0, 2, 3, 1}}, - }; +TEST_F(VulkanComputeAPITest, tensor_layout_metadata_test) { + // Test all combinations of tensor sizes, storage types, and memory layouts + // to ensure that layout metadata is computed correctly - for (const auto& test_case : test_cases) { - const size_t& ndim = std::get<0>(test_case); - const int32_t packed_dim = std::get<1>(test_case); - const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, packed_dim); + // Define test configuration for each layout type + struct LayoutTestConfig { + utils::GPUMemoryLayout layout; + vkapi::ScalarType dtype; + int32_t packed_dim; + int32_t outer_packed_dim; + bool is_block_packed; + }; - ASSERT_TRUE(dim_order == expected_dim_order); - } -} + std::vector layout_configs = { + // Standard layouts with float dtype + {utils::kWidthPacked, + vkapi::kFloat, + WHCN::kWidthDim, + WHCN::kWidthDim, + false}, + {utils::kHeightPacked, + vkapi::kFloat, + WHCN::kHeightDim, + WHCN::kHeightDim, + false}, + {utils::kChannelsPacked, + vkapi::kFloat, + WHCN::kChannelsDim, + WHCN::kChannelsDim, + false}, + + // Packed int8 vector layouts (single-dimension packed) + // Use kChar, which should be converted to kInt8x4 + {utils::kPackedInt8_4W, + vkapi::kChar, + WHCN::kWidthDim, + WHCN::kWidthDim, + false}, + {utils::kPackedInt8_4C, + vkapi::kChar, + WHCN::kChannelsDim, + WHCN::kChannelsDim, + false}, + {utils::kPackedInt8_4H, + vkapi::kChar, + WHCN::kHeightDim, + WHCN::kHeightDim, + false}, + + // Packed int8 block layouts (two-dimension packed) + // Use kChar, which should be converted to kInt8x4 + {utils::kPackedInt8_4W4C, + vkapi::kChar, + WHCN::kChannelsDim, + WHCN::kWidthDim, + true}, + {utils::kPackedInt8_4H4W, + vkapi::kChar, + WHCN::kWidthDim, + WHCN::kHeightDim, + true}, + }; -TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { - vTensor v_tensor_to_resize( - context(), - {25, 25, 25, 25}, - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked, - /*allocate_memory = */ false); + std::vector storage_types = { + utils::kBuffer, utils::kTexture3D}; for (const auto& sizes : standard_sizes_to_test) { - if (sizes.size() < 3) { - continue; + if (sizes.size() < 2) { + continue; // Skip 1D tensors } - for (const auto& layout : - {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { - { - const int32_t packed_dim = static_cast(layout); - std::vector dim_order = - calculate_dim_order(sizes.size(), packed_dim); - std::vector strides = calculate_strides(sizes, dim_order); - int64_t numel = utils::multiply_integers(sizes); - std::vector ref_strides = get_reference_strides(sizes, layout); - ASSERT_TRUE(strides == ref_strides); + for (const auto& storage_type : storage_types) { + for (const auto& config : layout_configs) { + // Skip block-packed layouts for 
tensors with less than 3 dimensions + if (config.is_block_packed && sizes.size() < 3) { + continue; + } + + // Create tensor + vTensor tensor( + context(), + sizes, + config.dtype, + storage_type, + config.layout, + /*allocate_memory = */ false); - std::vector unsqueezed_strides = - flip_and_unsqueeze(strides, kTensorStrides, numel); + // Verify sizes + ASSERT_TRUE(tensor.sizes() == sizes) + << "Sizes mismatch for layout=" << static_cast(config.layout) + << ", storage=" << static_cast(storage_type); - std::vector ref_unsqueezed_strides = - get_reference_strides(sizes, layout, true); + // Verify dtype + // For packed int8 layouts, kChar should be converted to kInt8x4 + vkapi::ScalarType expected_dtype = config.dtype; + if (config.dtype == vkapi::kChar) { + expected_dtype = vkapi::kInt8x4; + } + ASSERT_EQ(tensor.dtype(), expected_dtype) + << "Dtype mismatch for layout=" << static_cast(config.layout) + << ", expected=" << static_cast(expected_dtype) + << ", got=" << static_cast(tensor.dtype()); + + // Determine if packed_dim should be padded + // For packed int8 layouts (using kChar which converts to kInt8x4), + // always padded For texture storage, always padded For buffer storage + // with standard layouts, not padded + const bool expected_packed_dim_padded = + (config.dtype == vkapi::kChar) || (storage_type != utils::kBuffer); + + // For block-packed layouts, outer_packed_dim is also padded + const bool expected_outer_packed_dim_padded = config.is_block_packed; + + // Verify packed_dim_info + const auto& packed_dim_info = tensor.packed_dim_info(); + ASSERT_EQ(packed_dim_info.packed_dim, config.packed_dim) + << "packed_dim mismatch for layout=" + << static_cast(config.layout); + ASSERT_EQ(packed_dim_info.packed_dim_padded, expected_packed_dim_padded) + << "packed_dim_padded mismatch for layout=" + << static_cast(config.layout); + ASSERT_EQ(packed_dim_info.outer_packed_dim, config.outer_packed_dim) + << "outer_packed_dim mismatch for layout=" + << static_cast(config.layout); + ASSERT_EQ( + packed_dim_info.outer_packed_dim_padded, + expected_outer_packed_dim_padded) + << "outer_packed_dim_padded mismatch for layout=" + << static_cast(config.layout); + ASSERT_EQ(packed_dim_info.is_block_packed, config.is_block_packed) + << "is_block_packed mismatch for layout=" + << static_cast(config.layout); + + // Verify dim_order + std::vector ref_dim_order = + get_reference_dim_order(sizes.size(), config.packed_dim); + ASSERT_TRUE(tensor.dim_order() == ref_dim_order) + << "Dim order mismatch for layout=" + << static_cast(config.layout); + + // Verify padded_sizes + std::vector ref_padded_sizes = get_reference_padded_sizes( + sizes, + config.packed_dim, + expected_packed_dim_padded, + config.outer_packed_dim, + expected_outer_packed_dim_padded); + ASSERT_TRUE(tensor.padded_sizes() == ref_padded_sizes) + << "Padded sizes mismatch for layout=" + << static_cast(config.layout); + + if (storage_type == utils::kBuffer) { + // For buffer tensors, verify strides (only for standard layouts) + // For int8 layouts, we rely on padded_sizes and dim_order + // verification + if (config.dtype == vkapi::kFloat) { + std::vector ref_strides = + get_reference_strides(sizes, config.layout); + ASSERT_TRUE(tensor.strides() == ref_strides) + << "Strides mismatch for layout=" + << static_cast(config.layout); + + // Also test flip_and_unsqueeze operations + int64_t numel = utils::multiply_integers(sizes); + std::vector unsqueezed_strides = + flip_and_unsqueeze( + tensor.strides(), kTensorStrides, numel); + std::vector 
ref_unsqueezed_strides = + get_reference_strides(sizes, config.layout, true); + ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); + } + + // Verify physical_numel for buffer storage + int64_t ref_physical_numel = + get_reference_physical_numel(expected_dtype, ref_padded_sizes); + ASSERT_EQ(tensor.physical_numel(), ref_physical_numel) + << "Physical numel mismatch for buffer storage with layout=" + << static_cast(config.layout); + } else { + // For texture tensors, verify axis_map + std::vector expected_axis_map = {0, 1, 2, 2}; + ASSERT_TRUE(tensor.axis_map() == expected_axis_map) + << "Axis map mismatch for texture tensor with layout=" + << static_cast(config.layout); + ASSERT_TRUE(tensor.has_standard_axis_map()); + + // Verify image_extents for texture storage + utils::uvec3 ref_image_extents = get_reference_image_extents( + expected_dtype, + config.packed_dim, + config.outer_packed_dim, + config.is_block_packed, + ref_padded_sizes, + expected_axis_map); + ASSERT_EQ(tensor.image_extents(), ref_image_extents) + << "Image extents mismatch for texture storage with layout=" + << static_cast(config.layout); + } + } + } + } +} - ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); +TEST_F(VulkanComputeAPITest, tensor_layout_metadata_test_against_golden) { + // Test with hardcoded golden values for specific test cases. + // This complements the reference implementation test by providing concrete + // examples with known-good values. + + struct TestCase { + std::vector sizes; + vkapi::ScalarType dtype; + utils::GPUMemoryLayout layout; + // Expected values for both buffer and texture storage + std::vector expected_dim_order; + std::vector expected_padded_sizes_buffer; + std::vector expected_padded_sizes_texture; + std::vector expected_strides_buffer; + int64_t expected_physical_numel_buffer; + int64_t expected_physical_numel_texture; + utils::uvec3 expected_image_extents; + }; - std::vector whcn_dim_order = - flip_and_unsqueeze(dim_order, kTensorDimOrder, numel); + std::vector test_cases = { + // 2D tensor [5, 7] with width packed, float dtype + {/* sizes */ {5, 7}, + /* dtype */ vkapi::kFloat, + /* layout */ utils::kWidthPacked, + /* expected_dim_order */ {0, 1}, + /* expected_padded_sizes_buffer */ {1, 1, 5, 7}, + /* expected_padded_sizes_texture */ {1, 1, 5, 8}, + /* expected_strides_buffer */ {7, 1}, + /* expected_physical_numel_buffer */ 35, + /* expected_physical_numel_texture */ 40, + /* expected_image_extents */ {2, 5, 1}}, + + // 3D tensor [3, 5, 7] with channels packed, float dtype + {/* sizes */ {3, 5, 7}, + /* dtype */ vkapi::kFloat, + /* layout */ utils::kChannelsPacked, + /* expected_dim_order */ {1, 2, 0}, + /* expected_padded_sizes_buffer */ {1, 3, 5, 7}, + /* expected_padded_sizes_texture */ {1, 4, 5, 7}, + /* expected_strides_buffer */ {1, 7 * 3, 3}, + /* expected_physical_numel_buffer */ 105, + /* expected_physical_numel_texture */ 140, + /* expected_image_extents */ {7, 5, 1}}, + + // 4D tensor [2, 3, 5, 7] with height packed, float dtype + {/* sizes */ {2, 3, 5, 7}, + /* dtype */ vkapi::kFloat, + /* layout */ utils::kHeightPacked, + /* expected_dim_order */ {0, 1, 3, 2}, + /* expected_padded_sizes_buffer */ {2, 3, 5, 7}, + /* expected_padded_sizes_texture */ {2, 3, 8, 7}, + /* expected_strides_buffer */ {3 * 5 * 7, 5 * 7, 1, 5}, + /* expected_physical_numel_buffer */ 210, + /* expected_physical_numel_texture */ 336, + /* expected_image_extents */ {7, 2, 6}}, + + // 3D tensor [8, 12, 16] with packed int8 4W layout + {/* sizes */ {8, 12, 16}, + /* dtype */ 
vkapi::kChar, + /* layout */ utils::kPackedInt8_4W, + /* expected_dim_order */ {0, 1, 2}, + /* expected_padded_sizes_buffer */ {1, 8, 12, 16}, + /* expected_padded_sizes_texture */ {1, 8, 12, 16}, + /* expected_strides_buffer */ {}, + /* expected_physical_numel_buffer */ 384, + /* expected_physical_numel_texture */ 384, + /* expected_image_extents */ {1, 12, 8}}, + + // 3D tensor [8, 12, 16] with packed int8 4W4C block layout + {/* sizes */ {8, 12, 16}, + /* dtype */ vkapi::kChar, + /* layout */ utils::kPackedInt8_4W4C, + /* expected_dim_order */ {1, 2, 0}, + /* expected_padded_sizes_buffer */ {1, 8, 12, 16}, + /* expected_padded_sizes_texture */ {1, 8, 12, 16}, + /* expected_strides_buffer */ {}, + /* expected_physical_numel_buffer */ 384, + /* expected_physical_numel_texture */ 384, + /* expected_image_extents */ {4, 12, 2}}, + + // 3D tensor [9, 13, 17] with packed int8 4C layout (odd sizes) + {/* sizes */ {9, 13, 17}, + /* dtype */ vkapi::kChar, + /* layout */ utils::kPackedInt8_4C, + /* expected_dim_order */ {1, 2, 0}, + /* expected_padded_sizes_buffer */ {1, 12, 13, 17}, + /* expected_padded_sizes_texture */ {1, 12, 13, 17}, + /* expected_strides_buffer */ {}, + /* expected_physical_numel_buffer */ 663, + /* expected_physical_numel_texture */ 663, + /* expected_image_extents */ {17, 13, 1}}, + + // 3D tensor [9, 13, 17] with packed int8 4H4W block layout (odd sizes) + {/* sizes */ {9, 13, 17}, + /* dtype */ vkapi::kChar, + /* layout */ utils::kPackedInt8_4H4W, + /* expected_dim_order */ {0, 1, 2}, + /* expected_padded_sizes_buffer */ {1, 9, 16, 20}, + /* expected_padded_sizes_texture */ {1, 9, 16, 20}, + /* expected_strides_buffer */ {}, + /* expected_physical_numel_buffer */ 720, + /* expected_physical_numel_texture */ 720, + /* expected_image_extents */ {5, 4, 9}}, + }; - std::vector ref_whcn_dim_order = - create_whcn_dim_order(dim_order); + for (size_t i = 0; i < test_cases.size(); ++i) { + const auto& tc = test_cases[i]; - ASSERT_TRUE(whcn_dim_order == ref_whcn_dim_order); + // Test with buffer storage + { + vTensor tensor_buffer( + context(), + tc.sizes, + tc.dtype, + utils::kBuffer, + tc.layout, + /*allocate_memory = */ false); + + // Verify dtype (kChar -> kInt8x4) + vkapi::ScalarType expected_dtype = tc.dtype; + if (tc.dtype == vkapi::kChar) { + expected_dtype = vkapi::kInt8x4; + } + ASSERT_EQ(tensor_buffer.dtype(), expected_dtype) + << "Test case " << i << ": Buffer dtype mismatch"; + + // Verify dim_order + ASSERT_TRUE(tensor_buffer.dim_order() == tc.expected_dim_order) + << "Test case " << i << ": Buffer dim_order mismatch" + << " (expected size: " << tc.expected_dim_order.size() + << ", actual size: " << tensor_buffer.dim_order().size() << ")"; + + // Verify padded_sizes + ASSERT_TRUE( + tensor_buffer.padded_sizes() == tc.expected_padded_sizes_buffer) + << "Test case " << i << ": Buffer padded_sizes mismatch"; + + // Verify strides (only for float dtype) + if (tc.dtype == vkapi::kFloat && !tc.expected_strides_buffer.empty()) { + ASSERT_TRUE(tensor_buffer.strides() == tc.expected_strides_buffer) + << "Test case " << i << ": Buffer strides mismatch"; + } - // Create new vTensor and check that the strides are correct - vTensor new_v_tensor( - context(), - sizes, - vkapi::kFloat, - utils::kBuffer, - layout, - /*allocate_memory = */ false); + // Verify physical_numel + ASSERT_EQ( + tensor_buffer.physical_numel(), tc.expected_physical_numel_buffer) + << "Test case " << i << ": Buffer physical_numel mismatch"; + } - ASSERT_TRUE(new_v_tensor.strides() == ref_strides); + // Test 
with texture storage + { + vTensor tensor_texture( + context(), + tc.sizes, + tc.dtype, + utils::kTexture3D, + tc.layout, + /*allocate_memory = */ false); - // Resize vtensor and check that updated metadata is correct - v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); - ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); + // Verify dtype (kChar -> kInt8x4) + vkapi::ScalarType expected_dtype = tc.dtype; + if (tc.dtype == vkapi::kChar) { + expected_dtype = vkapi::kInt8x4; } + ASSERT_EQ(tensor_texture.dtype(), expected_dtype) + << "Test case " << i << ": Texture dtype mismatch"; + + // Verify dim_order (texture doesn't use dim_order, but it's still + // computed) + ASSERT_TRUE(tensor_texture.dim_order() == tc.expected_dim_order) + << "Test case " << i << ": Texture dim_order mismatch"; + + // Verify padded_sizes + ASSERT_TRUE( + tensor_texture.padded_sizes() == tc.expected_padded_sizes_texture) + << "Test case " << i << ": Texture padded_sizes mismatch"; + + // Verify axis_map + std::vector expected_axis_map = {0, 1, 2, 2}; + ASSERT_TRUE(tensor_texture.axis_map() == expected_axis_map) + << "Test case " << i << ": Texture axis_map mismatch"; + + // Verify physical_numel + ASSERT_EQ( + tensor_texture.physical_numel(), tc.expected_physical_numel_texture) + << "Test case " << i << ": Texture physical_numel mismatch"; + + // Verify image_extents + ASSERT_EQ(tensor_texture.image_extents(), tc.expected_image_extents) + << "Test case " << i << ": Texture image_extents mismatch" + << " (expected: [" << tc.expected_image_extents[0] << ", " + << tc.expected_image_extents[1] << ", " + << tc.expected_image_extents[2] << "], got: [" + << tensor_texture.image_extents()[0] << ", " + << tensor_texture.image_extents()[1] << ", " + << tensor_texture.image_extents()[2] << "])"; } } }
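For reference, the 4W4C golden case above can be checked by hand; the arithmetic below mirrors calculate_image_extents and calculate_gpu_buffer_numel with the default axis map {0, 1, 2, 2}:

// sizes {8, 12, 16} (C = 8, H = 12, W = 16), layout kPackedInt8_4W4C:
//   padded sizes              -> {1, 8, 12, 16}
//   initial extents (X, Y, Z)  = (W, H, C) = (16, 12, 8)
//   outer packed dim (width)   : X = 16 / 4 = 4
//   inner packed dim (channels): Z =  8 / 4 = 2
//   batch size is 1, so image extents = {4, 12, 2}
//   physical numel = (1 * 8 * 12 * 16) / 4 = 384 kInt8x4 elements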