446 changes: 280 additions & 166 deletions backends/vulkan/runtime/api/containers/Tensor.cpp

Large diffs are not rendered by default.

128 changes: 63 additions & 65 deletions backends/vulkan/runtime/api/containers/Tensor.h
@@ -22,48 +22,40 @@ namespace api {
static constexpr size_t kTensorDimLimit = 8;

/*
* Given a GPUMemoryLayout value, produce a dim order vector that matches the
* given memory layout. The produced dim order vector will be in the NCHW
* dimension order
* PackedDimInfo encapsulates metadata about packed dimensions in GPU tensors.
* This includes information about which dimension is packed, whether it's
* padded, and block packing information for special layouts like 4W4C and 4H4W.
*/
std::vector<int64_t> calculate_dim_order(
const size_t ndim,
const int32_t packed_dim);

/*
* Given the sizes of a tensor and the dim order of the tensor (both in NCHW)
* dimension order, calculate the strides of the tensor.
*/
std::vector<int64_t> calculate_strides(
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& dim_order);

/*
* When stored on the GPU, tensor data is stored using texels (i.e. a vector of
* 4 scalar values) in order to take advantage of the GPU's native vectorization
* capabilities. Furthermore, tensor metadata is passed in to shaders as ivec4
* types.
*
* To accommodate these vectorized types, the sizes of a tensor will be modified
* for GPU storage in the following ways:
*
* 1. The dimensionality of the tensor will be padded to a multiple of 4.
* 2. The size of the packed dimension will be padded to a multiple of 4.
*
* The "packed dimension" is determined based on the utils::GPUMemoryLayout
* argument.
*/
std::vector<int64_t> calculate_padded_sizes(
const std::vector<int64_t>& sizes,
const int32_t packed_dim);
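A small worked example of the two padding rules described above (illustrative values only, assuming WHCN packed-dim indexing where 0 = width, 1 = height, 2 = channels):

// Sketch: sizes given in NCHW order (C, H, W) with the channels dim packed.
std::vector<int64_t> padded =
    calculate_padded_sizes({3, 5, 7}, /*packed_dim=*/2);
// Expected: {1, 4, 5, 7} -- ndim is padded from 3 up to 4, then the packed
// (channels) size is padded from 3 up to 4.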

/*
* Calculate the image extents required of a texture backed tensor.
*/
utils::uvec3 calculate_image_extents(
const std::vector<int64_t>& padded_sizes,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim);
struct PackedDimInfo {
// Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for
// width, 1 for height, etc.). For texture backed tensors, this describes
// which dimension is packed along a texel. For buffer backed tensors, this
// describes which dimension has a stride of 1 (i.e. is last in the dim
// order).
int32_t packed_dim;
// Describes if the packed dimension is padded to a multiple of 4. This will
// be true for all tensors that use texture storage, and will also be true
// for the PACKED_PADDED memory layouts.
bool packed_dim_padded;
// Describes a second level of packing, if applicable (which will only apply
// to the 4W4C and 4H4W layouts). If there is no second level of packing,
// then this will be equal to packed_dim. Otherwise, it will represent the
// outer dim used to construct block packing. For example, 4W4C will have
// packed_dim = 2 and outer_packed_dim = 0.
int32_t outer_packed_dim;
// Whether the outer packed dim is padded to the next multiple of 4. This is
// true only for block-packed layouts.
bool outer_packed_dim_padded;
// True if this layout uses block packing (i.e., outer_packed_dim !=
// packed_dim). Block packing is used for layouts like 4W4C and 4H4W.
bool is_block_packed;

PackedDimInfo(
int32_t dim,
bool dim_padded,
int32_t outer_dim,
bool outer_dim_padded);
};
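As a rough usage sketch (not taken from the PR; the field values follow the comments above, the padded flags are assumptions for a texture-backed tensor, and it is assumed the constructor derives is_block_packed from outer_dim != dim):

// kPackedInt8_4W4C: channels (2) is the inner packed dim, width (0) the outer.
PackedDimInfo block_packed(
    /*dim=*/2, /*dim_padded=*/true, /*outer_dim=*/0, /*outer_dim_padded=*/true);
// A channels-packed texture tensor: single level of packing, so the outer
// packed dim equals the packed dim and is not padded.
PackedDimInfo texel_packed(
    /*dim=*/2, /*dim_padded=*/true, /*outer_dim=*/2, /*outer_dim_padded=*/false);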

struct LastAccess {
vkapi::PipelineStageFlags stage;
@@ -79,18 +71,6 @@ struct LastAccess {
: stage{stage_flags}, access{access_flags} {}
};

/*
* Calculate the number of elements that a GPU buffer would require to store the
* contents of a tensor. This will depend on the storage type and dtype of the
* tensor, as well as the features available on the device.
*/
int64_t calculate_gpu_buffer_numel(
Context* const context,
const std::vector<int64_t>& sizes,
const utils::uvec3 image_extents,
const utils::StorageType storage_type,
const vkapi::ScalarType dtype);

class vTensorStorage final {
public:
// Do not allow empty vTensorStorage construction
@@ -99,11 +79,11 @@ class vTensorStorage final {
vTensorStorage(
Context* context,
const utils::StorageType storage_type,
const utils::GPUMemoryLayout memory_layout,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim,
const std::vector<int64_t>& sizes,
const PackedDimInfo& packed_dim_info,
const std::vector<int64_t>& padded_sizes,
const vkapi::ScalarType dtype,
const int64_t physical_numel,
const bool allocate_memory = true);

vTensorStorage(Context* const context, const vkapi::VulkanImage& image);
@@ -295,13 +275,13 @@ class vTensor final {
const std::vector<int64_t>& sizes,
const TextureLimits& logical_limits,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim);
const PackedDimInfo& packed_dim_info);

void update(
const std::vector<int64_t>& sizes,
const TextureLimits& logical_limits,
const std::vector<int64_t>& axis_map,
const int32_t packed_dim);
const PackedDimInfo& packed_dim_info);
};

private:
@@ -310,16 +290,14 @@ class vTensor final {
* to construct a tensor.
*/

// Information about packed dimension padding and block packing
PackedDimInfo packed_dim_info_;
// Whether the tensor has elements of type float, int, etc.
vkapi::ScalarType dtype_;
// sizes of the tensor in NCHW dimension order
std::vector<int64_t> sizes_;
// Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for
// width, 1 for height, etc.). For texture backed tensors, this describes
// which dimension is packed along a texel. For buffer backed tensors, this
// describes which dimension has a stride of 1 (i.e. is last in the dim
// order).
int32_t packed_dim_;
// padded sizes of the tensor (pre-computed to avoid recalculation)
std::vector<int64_t> padded_sizes_;

/*
* "Layout" metadata. These describe with further detail how tensor data is
@@ -353,6 +331,10 @@ class vTensor final {
// number of elements based on the canonical sizes
size_t numel_;

// number of elements required for GPU buffer storage (with padding/packing)
// This is pre-computed to avoid recomputing calculate_gpu_buffer_numel
int64_t physical_numel_;

// For texture backed tensors, this int32 contains the axis map data packed
// into a single int32. For buffer backed tensors, this int32 contains the
// whcn dim order data packed into a single int32.
@@ -483,7 +465,11 @@ class vTensor final {
utils::GPUMemoryLayout estimate_memory_layout() const;

inline int32_t packed_dim() const {
return packed_dim_;
return packed_dim_info_.packed_dim;
}

inline const PackedDimInfo& packed_dim_info() const {
return packed_dim_info_;
}

/*
@@ -514,10 +500,22 @@ class vTensor final {
return strides_;
}

inline const std::vector<int64_t>& padded_sizes() const {
return padded_sizes_;
}

inline size_t numel() const {
return numel_;
}

inline int64_t physical_numel() const {
return physical_numel_;
}

inline utils::uvec3 image_extents() const {
return storage_->image_extents_;
}

inline size_t nbytes() const {
return element_size(dtype()) * numel();
}
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
@@ -362,6 +362,10 @@ class ComputeGraph final {
return values_.at(idx).toConstTensor().staging_buffer_numel();
}

inline int64_t physical_numel_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().physical_numel();
}

inline utils::StorageType storage_type_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().storage_type();
}
@@ -442,6 +446,11 @@ class ComputeGraph final {
return values_.at(idx).toConstTensor().packed_dim();
}

inline const api::PackedDimInfo& packed_dim_info_of(
const ValueRef idx) const {
return values_.at(idx).toConstTensor().packed_dim_info();
}

inline int32_t concat_dim_of(const ValueRef idx) const {
return values_.at(idx).toConstTensor().concat_dim();
}
12 changes: 7 additions & 5 deletions backends/vulkan/runtime/graph/ops/impl/Concat.cpp
@@ -108,17 +108,19 @@ utils::uvec3 concat_pick_global_wg_size(
// Calculate what the image extents would be of a tensor with the input
// volume's sizes. This produces the number of texels that would need to be
// written to.
const int32_t packed_dim = graph->packed_dim_of(out);

const int32_t packed_dim_idx = graph->packed_dim_of(out);
std::vector<int64_t> inp_volume_texel_sizes =
api::calculate_padded_sizes(inp_volume_sizes, packed_dim);
api::flip_and_unsqueeze<int64_t>(inp_volume_sizes, api::kTensorSizes, 1);

// If the concat_dim is the same as the packed dim, and the concat_offset for
// this input batch is not a multiple of 4, then the data from an input texel
// may be split up between two output texels. For example:
// I0 , I1 , I2 , I3
// O0 , O1 , O2 , X | X , X , X , X
// Therefore, 1 texel is added to the packed dim to account for this.
inp_volume_texel_sizes.at(3 - packed_dim) =
utils::div_up_4(inp_volume_texel_sizes.at(3 - packed_dim)) + 1;
inp_volume_texel_sizes.at(packed_dim_idx) =
utils::div_up_4(inp_volume_texel_sizes.at(packed_dim_idx)) + 1;

const uint32_t inp_volume_texel_numel =
utils::multiply_integers(inp_volume_texel_sizes);
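To make the extra texel concrete, a small hedged example of the arithmetic above (values are illustrative, not from the PR):

// With 6 elements along the packed dim and a concat_offset of 3, the data
// lands at output positions 3..8, which straddles 3 output texels even though
// div_up_4(6) == 2; the "+ 1" above covers this worst case.
const int64_t packed_extent = 6;
const int64_t texels_needed = utils::div_up_4(packed_extent) + 1; // 3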
@@ -324,7 +326,7 @@ void add_concat_node(
{1u, 1u, 1u},
{1u, 1u, 1u},
// Inputs and Outputs
{{concat_offset, vkapi::kWrite}},
{{concat_offset, vkapi::kReadWrite}},
// Parameter buffers
param_buffers,
// Push Constants
6 changes: 5 additions & 1 deletion backends/vulkan/runtime/graph/ops/impl/Tan.cpp
@@ -34,7 +34,11 @@ void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) {
add_storage_type_suffix(kernel_name, graph.storage_type_of(out));

vkapi::ParamsBindList ubos({});
ubos.append({graph.logical_limits_ubo(out)});
if (graph.storage_type_of(out) == utils::kBuffer) {
ubos.append({graph.numel_ubo(out)});
} else {
ubos.append({graph.logical_limits_ubo(out)});
}

graph.execute_nodes().emplace_back(new DynamicDispatchNode(
graph,
3 changes: 3 additions & 0 deletions backends/vulkan/runtime/utils/StorageUtils.cpp
@@ -13,6 +13,9 @@ namespace utils {

bool is_packed_int8_layout(const GPUMemoryLayout layout) {
switch (layout) {
case kPackedInt8_4W:
case kPackedInt8_4C:
case kPackedInt8_4H:
case kPackedInt8_4W4C:
case kPackedInt8_4H4W:
return true;
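Expected behavior after this change, as an illustrative sketch (assuming the switch's default case returns false):

utils::is_packed_int8_layout(utils::kPackedInt8_4W);   // true (newly covered)
utils::is_packed_int8_layout(utils::kPackedInt8_4W4C); // true (as before)
utils::is_packed_int8_layout(utils::kChannelsPacked);  // false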
32 changes: 32 additions & 0 deletions backends/vulkan/runtime/utils/StorageUtils.h
@@ -101,6 +101,14 @@ enum class GPUMemoryLayout : uint8_t {
* 16 element block is loaded, rather than 4 elements along one dimension.
*/

// "vector" packed layouts - single level of packing (4 elements along packed
// dim per int32)
TENSOR_PACKED_INT8_4W = 5u,
TENSOR_PACKED_INT8_4C = 6u,
TENSOR_PACKED_INT8_4H = 7u,

// Block packed layouts - two levels of packing (4x4 block composed of
// elements from two packed dims per ivec4)
TENSOR_PACKED_INT8_4W4C = 3u,
TENSOR_PACKED_INT8_4H4W = 4u,
};
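For orientation, a short sketch of how the two packing levels differ, based only on the comments above (the exact element-to-lane mapping lives in Tensor.cpp and is not reproduced here):

// kPackedInt8_4C (vector packed): 4 consecutive channel values at a fixed
// (n, h, w) are packed into a single int32.
// kPackedInt8_4W4C (block packed): a 4 (channels) x 4 (width) block at a
// fixed (n, h) is packed into a single ivec4; channels is the inner packed
// dim (packed_dim = 2) and width is the outer one (outer_packed_dim = 0).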
@@ -114,6 +122,15 @@ static constexpr GPUMemoryLayout kHeightPacked =
static constexpr GPUMemoryLayout kChannelsPacked =
GPUMemoryLayout::TENSOR_CHANNELS_PACKED;

static constexpr GPUMemoryLayout kPackedInt8_4W =
GPUMemoryLayout::TENSOR_PACKED_INT8_4W;

static constexpr GPUMemoryLayout kPackedInt8_4C =
GPUMemoryLayout::TENSOR_PACKED_INT8_4C;

static constexpr GPUMemoryLayout kPackedInt8_4H =
GPUMemoryLayout::TENSOR_PACKED_INT8_4H;

static constexpr GPUMemoryLayout kPackedInt8_4W4C =
GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C;

@@ -129,6 +146,12 @@ T to_packed_dim(const GPUMemoryLayout layout) {
return 1;
case kChannelsPacked:
return 2;
case kPackedInt8_4W:
return 0;
case kPackedInt8_4C:
return 2;
case kPackedInt8_4H:
return 1;
case kPackedInt8_4W4C:
return 2;
case kPackedInt8_4H4W:
@@ -170,6 +193,15 @@ inline std::ostream& operator<<(
case kChannelsPacked:
os << "TENSOR_CHANNELS_PACKED";
break;
case kPackedInt8_4W:
os << "TENSOR_PACKED_INT8_4W";
break;
case kPackedInt8_4C:
os << "TENSOR_PACKED_INT8_4C";
break;
case kPackedInt8_4H:
os << "TENSOR_PACKED_INT8_4H";
break;
case kPackedInt8_4W4C:
os << "TENSOR_PACKED_INT8_4W4C";
break;