446 changes: 280 additions & 166 deletions backends/vulkan/runtime/api/containers/Tensor.cpp

Large diffs are not rendered by default.

128 changes: 63 additions & 65 deletions backends/vulkan/runtime/api/containers/Tensor.h
@@ -22,48 +22,40 @@ namespace api {
static constexpr size_t kTensorDimLimit = 8;

/*
* Given a GPUMemoryLayout value, produce a dim order vector that matches the
* given memory layout. The produced dim order vector will be in the NCHW
* dimension order
* PackedDimInfo encapsulates metadata about packed dimensions in GPU tensors.
* This includes information about which dimension is packed, whether it's
* padded, and block packing information for special layouts like 4W4C and 4H4W.
*/
std::vector<int64_t> calculate_dim_order(
const size_t ndim,
const int32_t packed_dim);

/*
* Given the sizes of a tensor and the dim order of the tensor (both in NCHW)
* dimension order, calculate the strides of the tensor.
*/
std::vector<int64_t> calculate_strides(
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& dim_order);

/*
* When stored on the GPU, tensor data is stored using texels (i.e. a vector of
* 4 scalar values) in order to take advantage of the GPU's native vectorization
* capabilities. Furthermore, tensor metadata is passed in to shaders as ivec4
* types.
*
* To accommodate these vectorized types, the sizes of a tensor will be modified
* for GPU storage in the following ways:
*
* 1. The dimensionality of the tensor will be padded to a multiple of 4.
* 2. The size of the packed dimension will be padded to a multiple of 4.
*
* The "packed dimension" is determined based on the utils::GPUMemoryLayout
* argument.
*/
std::vector<int64_t> calculate_padded_sizes(
const std::vector<int64_t>& sizes,
const int32_t packed_dim);
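A small worked example of the two padding rules described above (illustrative values only, assuming WHCN packed-dim indexing where 0 = width, 1 = height, 2 = channels):

// Sketch: sizes given in NCHW order (C, H, W) with the channels dim packed.
std::vector<int64_t> padded =
    calculate_padded_sizes({3, 5, 7}, /*packed_dim=*/2);
// Expected: {1, 4, 5, 7} -- ndim is padded from 3 up to 4, then the packed
// (channels) size is padded from 3 up to 4.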

/*
* Calculate the image extents required of a texture backed tensor.
*/
utils::uvec3 calculate_image_extents(
const std::vector<int64_t>& padded_sizes,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim);
struct PackedDimInfo {
// Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for
// width, 1 for height, etc.). For texture backed tensors, this describes
// which dimension is packed along a texel. For buffer backed tensors, this
// describes which dimension has a stride of 1 (i.e. is last in the dim
// order).
int32_t packed_dim;
// Describes if the packed dimension is padded to a multiple of 4. This will
// be true for all tensors that use texture storage, and will also be true
// for the PACKED_PADDED memory layouts.
bool packed_dim_padded;
// Describes a second level of packing, if applicable (which will only apply
// to the 4W4C and 4H4W layouts). If there is no second level of packing,
// then this will be equal to packed_dim. Otherwise, it will represent the
// outer dim used to construct block packing. For example, 4W4C will have
// packed_dim = 2 and outer_packed_dim = 0.
int32_t outer_packed_dim;
// Whether the outer packed dim is padded to the next multiple of 4. This is
// true only for block-packed layouts.
bool outer_packed_dim_padded;
// True if this layout uses block packing (i.e., outer_packed_dim !=
// packed_dim). Block packing is used for layouts like 4W4C and 4H4W.
bool is_block_packed;

PackedDimInfo(
int32_t dim,
bool dim_padded,
int32_t outer_dim,
bool outer_dim_padded);
};
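As a rough usage sketch (not taken from the PR; the field values follow the comments above, the padded flags are assumptions for a texture-backed tensor, and it is assumed the constructor derives is_block_packed from outer_dim != dim):

// kPackedInt8_4W4C: channels (2) is the inner packed dim, width (0) the outer.
PackedDimInfo block_packed(
    /*dim=*/2, /*dim_padded=*/true, /*outer_dim=*/0, /*outer_dim_padded=*/true);
// A channels-packed texture tensor: single level of packing, so the outer
// packed dim equals the packed dim and is not padded.
PackedDimInfo texel_packed(
    /*dim=*/2, /*dim_padded=*/true, /*outer_dim=*/2, /*outer_dim_padded=*/false);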

struct LastAccess {
vkapi::PipelineStageFlags stage;
@@ -79,18 +71,6 @@ struct LastAccess {
: stage{stage_flags}, access{access_flags} {}
};

/*
* Calculate the number of elements that a GPU buffer would require to store the
* contents of a tensor. This will depend on the storage type and dtype of the
* tensor, as well as the features available on the device.
*/
int64_t calculate_gpu_buffer_numel(
Context* const context,
const std::vector<int64_t>& sizes,
const utils::uvec3 image_extents,
const utils::StorageType storage_type,
const vkapi::ScalarType dtype);

class vTensorStorage final {
public:
// Do not allow empty vTensorStorage construction
@@ -99,11 +79,11 @@ class vTensorStorage final {
vTensorStorage(
Context* context,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout memory_layout,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim,
const std::vector<int64_t>& sizes,
const PackedDimInfo& packed_dim_info,
const std::vector<int64_t>& padded_sizes,
const vkapi::ScalarType dtype,
const int64_t physical_numel,
const bool allocate_memory = true);

vTensorStorage(Context* const context, const vkapi::VulkanImage& image);
@@ -295,13 +275,13 @@ class vTensor final {
const std::vector<int64_t>& sizes,
const TextureLimits& logical_limits,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim);
const PackedDimInfo& packed_dim_info);

void update(
const std::vector<int64_t>& sizes,
const TextureLimits& logical_limits,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim);
const PackedDimInfo& packed_dim_info);
};

private:
@@ -310,16 +290,14 @@ class vTensor final {
* to construct a tensor.
*/

// Information about packed dimension padding and block packing
PackedDimInfo packed_dim_info_;
// Whether the tensor has elements of type float, int, etc.
vkapi::ScalarType dtype_;
// sizes of the tensor in NCHW dimension order
std::vector<int64_t> sizes_;
// Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for
// width, 1 for height, etc.). For texture backed tensors, this describes
// which dimension is packed along a texel. For buffer backed tensors, this
// describes which dimension has a stride of 1 (i.e. is last in the dim
// order).
int32_t packed_dim_;
// padded sizes of the tensor (pre-computed to avoid recalculation)
std::vector<int64_t> padded_sizes_;

/*
* "Layout" metadata. These describe with further detail how tensor data is
@@ -353,6 +331,10 @@ class vTensor final {
// number of elements based on the canonical sizes
size_t numel_;

// number of elements required for GPU buffer storage (with padding/packing)
// This is pre-computed to avoid recomputing calculate_gpu_buffer_numel
int64_t physical_numel_;

// For texture backed tensors, this int32 contains the axis map data packed
// into a single int32. For buffer backed tensors, this int32 contains the
// whcn dim order data packed into a single int32.
@@ -483,7 +465,11 @@ class vTensor final {
utils::GPUMemoryLayout estimate_memory_layout() const;

inline int32_t packed_dim() const {
return packed_dim_;
return packed_dim_info_.packed_dim;
}

inline const PackedDimInfo& packed_dim_info() const {
return packed_dim_info_;
}

/*
@@ -514,10 +500,22 @@ class vTensor final {
return strides_;
}

inline const std::vector<int64_t>& padded_sizes() const {
return padded_sizes_;
}

inline size_t numel() const {
return numel_;
}

inline int64_t physical_numel() const {
return physical_numel_;
}

inline utils::uvec3 image_extents() const {
return storage_->image_extents_;
}

inline size_t nbytes() const {
return element_size(dtype()) * numel();
}
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
@@ -362,6 +362,10 @@ class ComputeGraph final {
return values_.at(idx).toConstTensor().staging_buffer_numel();
}

inline int64_t physical_numel_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().physical_numel();
}

inline utils::StorageType storage_type_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().storage_type();
}
@@ -442,6 +446,11 @@ class ComputeGraph final {
return values_.at(idx).toConstTensor().packed_dim();
}

inline const api::PackedDimInfo& packed_dim_info_of(
const ValueRef idx) const {
return values_.at(idx).toConstTensor().packed_dim_info();
}

inline int32_t concat_dim_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().concat_dim();
}
12 changes: 7 additions & 5 deletions backends/vulkan/runtime/graph/ops/impl/Concat.cpp
@@ -108,17 +108,19 @@ utils::uvec3 concat_pick_global_wg_size(
// Calculate what the image extents would be of a tensor with the input
// volume's sizes. This produces the number of texels that would need to be
// written to.
const int32_t packed_dim = graph->packed_dim_of(out);

const int32_t packed_dim_idx = graph->packed_dim_of(out);
std::vector<int64_t> inp_volume_texel_sizes =
api::calculate_padded_sizes(inp_volume_sizes, packed_dim);
api::flip_and_unsqueeze<int64_t>(inp_volume_sizes, api::kTensorSizes, 1);

// If the concat_dim is the same as the packed dim, and the concat_offset for
// this input batch is not a multiple of 4, then the data from an input texel
// may be split up between two output texels. For example:
// I0 , I1 , I2 , I3
// O0 , O1 , O2 , X | X , X , X , X
// Therefore, 1 texel is added to the packed dim to account for this.
inp_volume_texel_sizes.at(3 - packed_dim) =
utils::div_up_4(inp_volume_texel_sizes.at(3 - packed_dim)) + 1;
inp_volume_texel_sizes.at(packed_dim_idx) =
utils::div_up_4(inp_volume_texel_sizes.at(packed_dim_idx)) + 1;

const uint32_t inp_volume_texel_numel =
utils::multiply_integers(inp_volume_texel_sizes);
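To make the extra texel concrete, a small hedged example of the arithmetic above (values are illustrative, not from the PR):

// With 6 elements along the packed dim and a concat_offset of 3, the data
// lands at output positions 3..8, which straddles 3 output texels even though
// div_up_4(6) == 2; the "+ 1" above covers this worst case.
const int64_t packed_extent = 6;
const int64_t texels_needed = utils::div_up_4(packed_extent) + 1; // 3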
@@ -324,7 +326,7 @@ void add_concat_node(
{1u, 1u, 1u},
{1u, 1u, 1u},
// Inputs and Outputs
{{concat_offset, vkapi::kWrite}},
{{concat_offset, vkapi::kReadWrite}},
// Parameter buffers
param_buffers,
// Push Constants
6 changes: 5 additions & 1 deletion backends/vulkan/runtime/graph/ops/impl/Tan.cpp
@@ -34,7 +34,11 @@ void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) {
add_storage_type_suffix(kernel_name, graph.storage_type_of(out));

vkapi::ParamsBindList ubos({});
ubos.append({graph.logical_limits_ubo(out)});
if (graph.storage_type_of(out) == utils::kBuffer) {
ubos.append({graph.numel_ubo(out)});
} else {
ubos.append({graph.logical_limits_ubo(out)});
}

graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
3 changes: 3 additions & 0 deletions backends/vulkan/runtime/utils/StorageUtils.cpp
@@ -13,6 +13,9 @@ namespace utils {

bool is_packed_int8_layout(const GPUMemoryLayout layout) {
switch (layout) {
case kPackedInt8_4W:
case kPackedInt8_4C:
case kPackedInt8_4H:
case kPackedInt8_4W4C:
case kPackedInt8_4H4W:
return true;
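Expected behavior after this change, as an illustrative sketch (assuming the switch's default case returns false):

utils::is_packed_int8_layout(utils::kPackedInt8_4W);   // true (newly covered)
utils::is_packed_int8_layout(utils::kPackedInt8_4W4C); // true (as before)
utils::is_packed_int8_layout(utils::kChannelsPacked);  // false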
32 changes: 32 additions & 0 deletions backends/vulkan/runtime/utils/StorageUtils.h
@@ -101,6 +101,14 @@ enum class GPUMemoryLayout : uint8_t {
* 16 element block is loaded, rather than 4 elements along one dimension.
*/

// "vector" packed layouts - single level of packing (4 elements along packed
// dim per int32)
TENSOR_PACKED_INT8_4W = 5u,
TENSOR_PACKED_INT8_4C = 6u,
TENSOR_PACKED_INT8_4H = 7u,

// Block packed layouts - two levels of packing (4x4 block composed of
// elements from two packed dims per ivec4)
TENSOR_PACKED_INT8_4W4C = 3u,
TENSOR_PACKED_INT8_4H4W = 4u,
};
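For orientation, a short sketch of how the two packing levels differ, based only on the comments above (the exact element-to-lane mapping lives in Tensor.cpp and is not reproduced here):

// kPackedInt8_4C (vector packed): 4 consecutive channel values at a fixed
// (n, h, w) are packed into a single int32.
// kPackedInt8_4W4C (block packed): a 4 (channels) x 4 (width) block at a
// fixed (n, h) is packed into a single ivec4; channels is the inner packed
// dim (packed_dim = 2) and width is the outer one (outer_packed_dim = 0).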
@@ -114,6 +122,15 @@ static constexpr GPUMemoryLayout kHeightPacked =
static constexpr GPUMemoryLayout kChannelsPacked =
GPUMemoryLayout::TENSOR_CHANNELS_PACKED;

static constexpr GPUMemoryLayout kPackedInt8_4W =
GPUMemoryLayout::TENSOR_PACKED_INT8_4W;

static constexpr GPUMemoryLayout kPackedInt8_4C =
GPUMemoryLayout::TENSOR_PACKED_INT8_4C;

static constexpr GPUMemoryLayout kPackedInt8_4H =
GPUMemoryLayout::TENSOR_PACKED_INT8_4H;

static constexpr GPUMemoryLayout kPackedInt8_4W4C =
GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C;

@@ -129,6 +146,12 @@ T to_packed_dim(const GPUMemoryLayout layout) {
return 1;
case kChannelsPacked:
return 2;
case kPackedInt8_4W:
return 0;
case kPackedInt8_4C:
return 2;
case kPackedInt8_4H:
return 1;
case kPackedInt8_4W4C:
return 2;
case kPackedInt8_4H4W:
@@ -170,6 +193,15 @@ inline std::ostream& operator<<(
case kChannelsPacked:
os << "TENSOR_CHANNELS_PACKED";
break;
case kPackedInt8_4W:
os << "TENSOR_PACKED_INT8_4W";
break;
case kPackedInt8_4C:
os << "TENSOR_PACKED_INT8_4C";
break;
case kPackedInt8_4H:
os << "TENSOR_PACKED_INT8_4H";
break;
case kPackedInt8_4W4C:
os << "TENSOR_PACKED_INT8_4W4C";
break;