From a36ed5ad28928899e88e7a1f04c848cfd1500271 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 20 Mar 2026 16:05:24 -0400 Subject: [PATCH] add `vortex-compressor` Signed-off-by: Connor Tsui --- Cargo.lock | 20 +- Cargo.toml | 2 + fuzz/src/array/mod.rs | 15 +- vortex-array/public-api.lock | 8 + vortex-array/src/executor.rs | 1 + vortex-compressor/Cargo.toml | 34 + vortex-compressor/public-api.lock | 865 ++++++++++++++++++ vortex-compressor/src/builtins/constant.rs | 216 +++++ vortex-compressor/src/builtins/dict/float.rs | 152 +++ .../src/builtins/dict/integer.rs | 166 ++++ vortex-compressor/src/builtins/dict/mod.rs | 316 +++++++ vortex-compressor/src/builtins/mod.rs | 45 + vortex-compressor/src/compressor.rs | 525 +++++++++++ vortex-compressor/src/ctx.rs | 109 +++ vortex-compressor/src/lib.rs | 27 + vortex-compressor/src/sample.rs | 161 ++++ vortex-compressor/src/scheme.rs | 286 ++++++ vortex-compressor/src/stats/cache.rs | 133 +++ vortex-compressor/src/stats/float.rs | 315 +++++++ vortex-compressor/src/stats/integer.rs | 622 +++++++++++++ vortex-compressor/src/stats/mod.rs | 22 + vortex-compressor/src/stats/options.rs | 26 + vortex-compressor/src/stats/string.rs | 102 +++ vortex-file/src/strategy.rs | 42 +- vortex-layout/src/layouts/compressed.rs | 5 +- vortex/public-api.lock | 6 +- vortex/src/lib.rs | 5 +- 27 files changed, 4193 insertions(+), 33 deletions(-) create mode 100644 vortex-compressor/Cargo.toml create mode 100644 vortex-compressor/public-api.lock create mode 100644 vortex-compressor/src/builtins/constant.rs create mode 100644 vortex-compressor/src/builtins/dict/float.rs create mode 100644 vortex-compressor/src/builtins/dict/integer.rs create mode 100644 vortex-compressor/src/builtins/dict/mod.rs create mode 100644 vortex-compressor/src/builtins/mod.rs create mode 100644 vortex-compressor/src/compressor.rs create mode 100644 vortex-compressor/src/ctx.rs create mode 100644 vortex-compressor/src/lib.rs create mode 100644 
vortex-compressor/src/sample.rs create mode 100644 vortex-compressor/src/scheme.rs create mode 100644 vortex-compressor/src/stats/cache.rs create mode 100644 vortex-compressor/src/stats/float.rs create mode 100644 vortex-compressor/src/stats/integer.rs create mode 100644 vortex-compressor/src/stats/mod.rs create mode 100644 vortex-compressor/src/stats/options.rs create mode 100644 vortex-compressor/src/stats/string.rs diff --git a/Cargo.lock b/Cargo.lock index 54014adfb1d..8fc152107c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9860,7 +9860,6 @@ name = "vortex-btrblocks" version = "0.1.0" dependencies = [ "codspeed-divan-compat", - "enum-iterator", "getrandom 0.4.2", "itertools 0.14.0", "num-traits", @@ -9873,6 +9872,7 @@ dependencies = [ "vortex-alp", "vortex-array", "vortex-buffer", + "vortex-compressor", "vortex-datetime-parts", "vortex-decimal-byte-parts", "vortex-error", @@ -9943,6 +9943,24 @@ dependencies = [ "vortex-session", ] +[[package]] +name = "vortex-compressor" +version = "0.1.0" +dependencies = [ + "itertools 0.14.0", + "num-traits", + "parking_lot", + "rand 0.10.0", + "rstest", + "rustc-hash", + "tracing", + "vortex-array", + "vortex-buffer", + "vortex-error", + "vortex-mask", + "vortex-utils", +] + [[package]] name = "vortex-cub" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 0d6853627a8..4a374e1e034 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "vortex-proto", "vortex-array", "vortex-tensor", + "vortex-compressor", "vortex-btrblocks", "vortex-layout", "vortex-scan", @@ -259,6 +260,7 @@ vortex-array = { version = "0.1.0", path = "./vortex-array", default-features = vortex-btrblocks = { version = "0.1.0", path = "./vortex-btrblocks", default-features = false } vortex-buffer = { version = "0.1.0", path = "./vortex-buffer", default-features = false } vortex-bytebool = { version = "0.1.0", path = "./encodings/bytebool", default-features = false } +vortex-compressor = { version = "0.1.0", path = 
"./vortex-compressor", default-features = false } vortex-datafusion = { version = "0.1.0", path = "./vortex-datafusion", default-features = false } vortex-datetime-parts = { version = "0.1.0", path = "./encodings/datetime-parts", default-features = false } vortex-decimal-byte-parts = { version = "0.1.0", path = "encodings/decimal-byte-parts", default-features = false } diff --git a/fuzz/src/array/mod.rs b/fuzz/src/array/mod.rs index 0b101b91d8f..70094fed072 100644 --- a/fuzz/src/array/mod.rs +++ b/fuzz/src/array/mod.rs @@ -61,9 +61,10 @@ use vortex_array::search_sorted::SearchSorted; use vortex_array::search_sorted::SearchSortedSide; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; -use vortex_btrblocks::FloatCode; -use vortex_btrblocks::IntCode; -use vortex_btrblocks::StringCode; +use vortex_btrblocks::SchemeExt; +use vortex_btrblocks::schemes::float; +use vortex_btrblocks::schemes::integer; +use vortex_btrblocks::schemes::string; use vortex_error::VortexExpect; use vortex_error::vortex_panic; use vortex_mask::Mask; @@ -546,9 +547,11 @@ pub fn compress_array(array: &ArrayRef, strategy: CompressorStrategy) -> ArrayRe .compress(array) .vortex_expect("BtrBlocksCompressor compress should succeed in fuzz test"), CompressorStrategy::Compact => BtrBlocksCompressorBuilder::default() - .include_string([StringCode::Zstd]) - .include_int([IntCode::Pco]) - .include_float([FloatCode::Pco]) + .include([ + string::ZstdScheme.id(), + integer::PcoScheme.id(), + float::PcoScheme.id(), + ]) .build() .compress(array) .vortex_expect("Compact compress should succeed in fuzz test"), diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index c53d9a893cf..b47484e1adf 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -22266,6 +22266,14 @@ pub fn vortex_array::ExecutionCtx::new(session: vortex_session::VortexSession) - pub fn vortex_array::ExecutionCtx::session(&self) -> 
&vortex_session::VortexSession +impl core::clone::Clone for vortex_array::ExecutionCtx + +pub fn vortex_array::ExecutionCtx::clone(&self) -> vortex_array::ExecutionCtx + +impl core::fmt::Debug for vortex_array::ExecutionCtx + +pub fn vortex_array::ExecutionCtx::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + impl core::fmt::Display for vortex_array::ExecutionCtx pub fn vortex_array::ExecutionCtx::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/vortex-array/src/executor.rs b/vortex-array/src/executor.rs index da05450f8de..114adf355c3 100644 --- a/vortex-array/src/executor.rs +++ b/vortex-array/src/executor.rs @@ -172,6 +172,7 @@ impl dyn DynArray + '_ { /// /// Accumulates a trace of execution steps. Individual steps are logged at TRACE level for /// real-time following, and the full trace is dumped at DEBUG level when the context is dropped. +#[derive(Debug, Clone)] pub struct ExecutionCtx { id: usize, session: VortexSession, diff --git a/vortex-compressor/Cargo.toml b/vortex-compressor/Cargo.toml new file mode 100644 index 00000000000..260c9c531f5 --- /dev/null +++ b/vortex-compressor/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "vortex-compressor" +authors = { workspace = true } +categories = { workspace = true } +description = "Encoding-agnostic compression framework for Vortex arrays" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = { workspace = true } +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[dependencies] +itertools = { workspace = true } +num-traits = { workspace = true } +parking_lot = { workspace = true } +rand = { workspace = true } +rustc-hash = { workspace = true } +tracing = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-error = { workspace = true } 
+vortex-mask = { workspace = true } +vortex-utils = { workspace = true } + +[dev-dependencies] +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[lints] +workspace = true diff --git a/vortex-compressor/public-api.lock b/vortex-compressor/public-api.lock new file mode 100644 index 00000000000..3fbc28076eb --- /dev/null +++ b/vortex-compressor/public-api.lock @@ -0,0 +1,865 @@ +pub mod vortex_compressor + +pub mod vortex_compressor::builtins + +pub struct vortex_compressor::builtins::FloatConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::clone(&self) -> vortex_compressor::builtins::FloatConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::FloatConstantScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::eq(&self, other: &vortex_compressor::builtins::FloatConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::FloatConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::FloatConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::descendant_exclusions(&self) -> 
alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::FloatDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::clone(&self) -> vortex_compressor::builtins::FloatDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::FloatDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::eq(&self, other: &vortex_compressor::builtins::FloatDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::FloatDictScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::FloatDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::compress(&self, 
compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::IntConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::clone(&self) -> vortex_compressor::builtins::IntConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::IntConstantScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::eq(&self, other: &vortex_compressor::builtins::IntConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::IntConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::IntConstantScheme + 
+impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::IntDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::clone(&self) -> vortex_compressor::builtins::IntDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::IntDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::eq(&self, other: &vortex_compressor::builtins::IntDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::IntDictScheme + +pub fn 
vortex_compressor::builtins::IntDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::IntDictScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::IntDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::StringConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::clone(&self) -> vortex_compressor::builtins::StringConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::StringConstantScheme + +impl core::cmp::PartialEq for 
vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::eq(&self, other: &vortex_compressor::builtins::StringConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::StringConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::StringConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct 
vortex_compressor::builtins::StringDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::clone(&self) -> vortex_compressor::builtins::StringDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::StringDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::eq(&self, other: &vortex_compressor::builtins::StringDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::StringDictScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::StringDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn 
vortex_compressor::builtins::StringDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::builtins::float_dictionary_encode(stats: &vortex_compressor::stats::FloatStats) -> vortex_array::arrays::dict::array::DictArray + +pub fn vortex_compressor::builtins::integer_dictionary_encode(stats: &vortex_compressor::stats::IntegerStats) -> vortex_array::arrays::dict::array::DictArray + +pub fn vortex_compressor::builtins::is_float_primitive(canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::is_integer_primitive(canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::is_utf8_string(canonical: &vortex_array::canonical::Canonical) -> bool + +pub mod vortex_compressor::ctx + +pub struct vortex_compressor::ctx::CompressorContext + +impl vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::as_leaf(self) -> Self + +pub fn vortex_compressor::ctx::CompressorContext::as_sample(self) -> Self + +pub fn vortex_compressor::ctx::CompressorContext::cascade_history(&self) -> &[(vortex_compressor::scheme::SchemeId, usize)] + +pub fn vortex_compressor::ctx::CompressorContext::finished_cascading(&self) -> bool + +pub fn vortex_compressor::ctx::CompressorContext::is_sample(&self) -> bool + +pub fn vortex_compressor::ctx::CompressorContext::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::ctx::CompressorContext::with_stats_options(self, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +impl core::clone::Clone for vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::clone(&self) -> vortex_compressor::ctx::CompressorContext + +impl 
core::fmt::Debug for vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub const vortex_compressor::ctx::MAX_CASCADE: usize + +pub mod vortex_compressor::scheme + +pub enum vortex_compressor::scheme::ChildSelection + +pub vortex_compressor::scheme::ChildSelection::All + +pub vortex_compressor::scheme::ChildSelection::Many(&'static [usize]) + +pub vortex_compressor::scheme::ChildSelection::One(usize) + +impl vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::contains(&self, child_index: usize) -> bool + +impl core::clone::Clone for vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::clone(&self) -> vortex_compressor::scheme::ChildSelection + +impl core::fmt::Debug for vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::ChildSelection + +pub struct vortex_compressor::scheme::AncestorExclusion + +pub vortex_compressor::scheme::AncestorExclusion::ancestor: vortex_compressor::scheme::SchemeId + +pub vortex_compressor::scheme::AncestorExclusion::children: vortex_compressor::scheme::ChildSelection + +impl core::clone::Clone for vortex_compressor::scheme::AncestorExclusion + +pub fn vortex_compressor::scheme::AncestorExclusion::clone(&self) -> vortex_compressor::scheme::AncestorExclusion + +impl core::fmt::Debug for vortex_compressor::scheme::AncestorExclusion + +pub fn vortex_compressor::scheme::AncestorExclusion::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::AncestorExclusion + +pub struct vortex_compressor::scheme::DescendantExclusion + +pub vortex_compressor::scheme::DescendantExclusion::children: 
vortex_compressor::scheme::ChildSelection + +pub vortex_compressor::scheme::DescendantExclusion::excluded: vortex_compressor::scheme::SchemeId + +impl core::clone::Clone for vortex_compressor::scheme::DescendantExclusion + +pub fn vortex_compressor::scheme::DescendantExclusion::clone(&self) -> vortex_compressor::scheme::DescendantExclusion + +impl core::fmt::Debug for vortex_compressor::scheme::DescendantExclusion + +pub fn vortex_compressor::scheme::DescendantExclusion::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::DescendantExclusion + +pub struct vortex_compressor::scheme::SchemeId + +impl core::clone::Clone for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::clone(&self) -> vortex_compressor::scheme::SchemeId + +impl core::cmp::Eq for vortex_compressor::scheme::SchemeId + +impl core::cmp::PartialEq for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::eq(&self, other: &vortex_compressor::scheme::SchemeId) -> bool + +impl core::fmt::Debug for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::hash<__H: core::hash::Hasher>(&self, state: &mut __H) + +impl core::marker::Copy for vortex_compressor::scheme::SchemeId + +impl core::marker::StructuralPartialEq for vortex_compressor::scheme::SchemeId + +pub trait vortex_compressor::scheme::Scheme: core::fmt::Debug + core::marker::Send + core::marker::Sync + +pub fn vortex_compressor::scheme::Scheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_compressor::scheme::Scheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::scheme::Scheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::scheme::Scheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::scheme::Scheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::scheme::Scheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::scheme::Scheme::num_children(&self) -> usize + +pub fn vortex_compressor::scheme::Scheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::scheme::Scheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub 
fn vortex_compressor::builtins::FloatConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_compressor::builtins::IntConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: 
vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringDictScheme + +pub fn 
vortex_compressor::builtins::StringDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub trait vortex_compressor::scheme::SchemeExt: vortex_compressor::scheme::Scheme + +pub fn vortex_compressor::scheme::SchemeExt::id(&self) -> vortex_compressor::scheme::SchemeId + +impl vortex_compressor::scheme::SchemeExt for T + +pub fn T::id(&self) -> vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::estimate_compression_ratio_with_sampling(scheme: &S, compressor: &vortex_compressor::CascadingCompressor, array: &vortex_array::array::ArrayRef, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub mod vortex_compressor::stats + +pub enum vortex_compressor::stats::FloatErasedStats + +pub vortex_compressor::stats::FloatErasedStats::F16(vortex_compressor::stats::FloatTypedStats) + 
+pub vortex_compressor::stats::FloatErasedStats::F32(vortex_compressor::stats::FloatTypedStats) + +pub vortex_compressor::stats::FloatErasedStats::F64(vortex_compressor::stats::FloatTypedStats) + +impl core::clone::Clone for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::clone(&self) -> vortex_compressor::stats::FloatErasedStats + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::fmt::Debug for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub enum vortex_compressor::stats::IntegerErasedStats + +pub vortex_compressor::stats::IntegerErasedStats::I16(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I32(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I64(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I8(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U16(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U32(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U64(vortex_compressor::stats::IntegerTypedStats) + +pub 
vortex_compressor::stats::IntegerErasedStats::U8(vortex_compressor::stats::IntegerTypedStats) + +impl vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerErasedStats::max_ilog2(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerErasedStats::max_minus_min(&self) -> u64 + +pub fn vortex_compressor::stats::IntegerErasedStats::min_is_negative(&self) -> bool + +pub fn vortex_compressor::stats::IntegerErasedStats::min_is_zero(&self) -> bool + +pub fn vortex_compressor::stats::IntegerErasedStats::most_frequent_value_and_count(&self) -> core::option::Option<(vortex_array::scalar::typed_view::primitive::pvalue::PValue, u32)> + +impl core::clone::Clone for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::clone(&self) -> vortex_compressor::stats::IntegerErasedStats + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl 
core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::fmt::Debug for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::ArrayAndStats + +impl vortex_compressor::stats::ArrayAndStats + +pub fn vortex_compressor::stats::ArrayAndStats::array(&self) -> &vortex_array::array::ArrayRef + +pub fn vortex_compressor::stats::ArrayAndStats::float_stats(&mut self) -> &vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::ArrayAndStats::get_or_insert_with(&mut self, f: impl core::ops::function::FnOnce() -> T) -> &T + +pub fn vortex_compressor::stats::ArrayAndStats::integer_stats(&mut self) -> &vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::ArrayAndStats::into_array(self) -> vortex_array::array::ArrayRef + +pub fn vortex_compressor::stats::ArrayAndStats::new(array: vortex_array::array::ArrayRef, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::ArrayAndStats::string_stats(&mut self) -> &vortex_compressor::stats::StringStats + +pub struct vortex_compressor::stats::FloatDistinctInfo + +impl vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::distinct_values(&self) -> &vortex_utils::aliases::hash_set::HashSet, rustc_hash::FxBuildHasher> + +impl 
core::clone::Clone for vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::clone(&self) -> vortex_compressor::stats::FloatDistinctInfo + +impl core::fmt::Debug for vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::FloatStats + +impl vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::average_run_length(&self) -> u32 + +pub fn vortex_compressor::stats::FloatStats::erased(&self) -> &vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatStats::generate(input: &vortex_array::arrays::primitive::array::PrimitiveArray) -> Self + +pub fn vortex_compressor::stats::FloatStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::FloatStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::FloatStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray + +pub fn vortex_compressor::stats::FloatStats::value_count(&self) -> u32 + +impl vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::distinct_count(&self) -> core::option::Option + +impl core::clone::Clone for vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::clone(&self) -> vortex_compressor::stats::FloatStats + +impl core::fmt::Debug for vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::FloatTypedStats + +impl vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::distinct(&self) -> core::option::Option<&vortex_compressor::stats::FloatDistinctInfo> + +impl 
core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::clone::Clone for vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::clone(&self) -> vortex_compressor::stats::FloatTypedStats + +impl core::fmt::Debug for vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::GenerateStatsOptions + +pub vortex_compressor::stats::GenerateStatsOptions::count_distinct_values: bool + +impl vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::merge(self, other: Self) -> Self + +impl core::clone::Clone for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::clone(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl core::default::Default for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::default() -> vortex_compressor::stats::GenerateStatsOptions + +impl core::fmt::Debug for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::stats::IntegerDistinctInfo + +impl 
vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::distinct_values(&self) -> &vortex_utils::aliases::hash_map::HashMap, u32, rustc_hash::FxBuildHasher> + +impl core::clone::Clone for vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::clone(&self) -> vortex_compressor::stats::IntegerDistinctInfo + +impl core::fmt::Debug for vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::IntegerStats + +impl vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::average_run_length(&self) -> u32 + +pub fn vortex_compressor::stats::IntegerStats::erased(&self) -> &vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerStats::generate(input: &vortex_array::arrays::primitive::array::PrimitiveArray) -> Self + +pub fn vortex_compressor::stats::IntegerStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::IntegerStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::IntegerStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray + +pub fn vortex_compressor::stats::IntegerStats::value_count(&self) -> u32 + +impl vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerStats::most_frequent_value_and_count(&self) -> core::option::Option<(vortex_array::scalar::typed_view::primitive::pvalue::PValue, u32)> + +impl core::clone::Clone for vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::clone(&self) -> vortex_compressor::stats::IntegerStats + +impl 
core::fmt::Debug for vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::IntegerTypedStats + +impl vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::distinct(&self) -> core::option::Option<&vortex_compressor::stats::IntegerDistinctInfo> + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: 
vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::clone::Clone for vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::clone(&self) -> vortex_compressor::stats::IntegerTypedStats + +impl core::fmt::Debug for vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::StringStats + +impl vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::estimated_distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::StringStats::generate(input: &vortex_array::arrays::varbinview::array::VarBinViewArray) -> Self + +pub fn vortex_compressor::stats::StringStats::generate_opts(input: &vortex_array::arrays::varbinview::array::VarBinViewArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::StringStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::StringStats::source(&self) -> &vortex_array::arrays::varbinview::array::VarBinViewArray + +pub fn vortex_compressor::stats::StringStats::value_count(&self) -> u32 + +impl core::clone::Clone for vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::clone(&self) -> vortex_compressor::stats::StringStats + +impl core::fmt::Debug for vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::CascadingCompressor + +impl vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult + +pub fn vortex_compressor::CascadingCompressor::compress_child(&self, child: &vortex_array::array::ArrayRef, parent_ctx: &vortex_compressor::ctx::CompressorContext, 
parent_id: vortex_compressor::scheme::SchemeId, child_index: usize) -> vortex_error::VortexResult + +pub fn vortex_compressor::CascadingCompressor::execution_ctx(&self) -> parking_lot::mutex::MutexGuard<'_, vortex_array::executor::ExecutionCtx> + +pub fn vortex_compressor::CascadingCompressor::new(schemes: alloc::vec::Vec<&'static dyn vortex_compressor::scheme::Scheme>) -> Self + +impl core::clone::Clone for vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::clone(&self) -> vortex_compressor::CascadingCompressor + +impl core::fmt::Debug for vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/vortex-compressor/src/builtins/constant.rs b/vortex-compressor/src/builtins/constant.rs new file mode 100644 index 00000000000..178f67e3e9d --- /dev/null +++ b/vortex-compressor/src/builtins/constant.rs @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding schemes for integer, float, and string arrays. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::aggregate_fn::fns::is_constant::is_constant; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::MaskedArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::ValidityHelper; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use super::is_integer_primitive; +use super::is_utf8_string; +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +/// Constant encoding for integer arrays with a single distinct value. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntConstantScheme; + +impl Scheme for IntConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.integer_stats(); + + if stats.distinct_count().is_none_or(|count| count > 1) { + return Ok(0.0); + } + + Ok(stats.value_count() as f64) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let source = data.integer_stats().source().clone(); + compress_constant_primitive(&source) + } +} + +/// Constant encoding for float arrays with a single distinct value. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatConstantScheme; + +impl Scheme for FloatConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.float_stats(); + + if stats.null_count() as usize == stats.source().len() || stats.value_count() == 0 { + return Ok(0.0); + } + + if stats.distinct_count().is_some_and(|count| count == 1) { + return Ok(stats.value_count() as f64); + } + + Ok(0.0) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let source = data.float_stats().source().clone(); + compress_constant_primitive(&source) + } +} + +/// Constant encoding for string arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringConstantScheme; + +impl Scheme for StringConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.string_stats(); + + if stats.estimated_distinct_count().is_none_or(|c| c > 1) + || !is_constant( + &stats.source().clone().into_array(), + &mut compressor.execution_ctx(), + )? + { + return Ok(0.0); + } + + // Force constant in these cases. 
+ Ok(f64::MAX) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + let scalar_idx = + (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); + + match scalar_idx { + Some(idx) => { + let scalar = stats.source().scalar_at(idx)?; + let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); + if !stats.source().all_valid()? { + Ok( + MaskedArray::try_new(const_arr, stats.source().validity().clone())? + .into_array(), + ) + } else { + Ok(const_arr) + } + } + None => Ok(ConstantArray::new( + Scalar::null(stats.source().dtype().clone()), + stats.source().len(), + ) + .into_array()), + } + } +} + +/// Shared helper for compressing a constant primitive array (int or float). +fn compress_constant_primitive(source: &PrimitiveArray) -> VortexResult { + let scalar_idx = (0..source.len()).position(|idx| source.is_valid(idx).unwrap_or(false)); + + match scalar_idx { + Some(idx) => { + let scalar = source.scalar_at(idx)?; + let const_arr = ConstantArray::new(scalar, source.len()).into_array(); + if !source.all_valid()? { + Ok(MaskedArray::try_new(const_arr, source.validity().clone())?.into_array()) + } else { + Ok(const_arr) + } + } + None => { + Ok(ConstantArray::new(Scalar::null(source.dtype().clone()), source.len()).into_array()) + } + } +} diff --git a/vortex-compressor/src/builtins/dict/float.rs b/vortex-compressor/src/builtins/dict/float.rs new file mode 100644 index 00000000000..d9a7af35e16 --- /dev/null +++ b/vortex-compressor/src/builtins/dict/float.rs @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Float-specific dictionary encoding implementation. +//! +//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for +//! external compatibility. 
+ +use vortex_array::IntoArray; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::half::f16; +use vortex_array::validity::Validity; +use vortex_array::vtable::ValidityHelper; +use vortex_buffer::Buffer; +use vortex_error::VortexExpect; + +use crate::stats::FloatErasedStats; +use crate::stats::FloatStats; + +/// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values. +macro_rules! typed_encode { + ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ + let distinct = $typed.distinct().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect(); + + let max_code = values.len(); + let codes = if max_code <= u8::MAX as usize { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + } else if max_code <= u16::MAX as usize { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + } else { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + }; + + let values_validity = match $validity { + Validity::NonNullable => Validity::NonNullable, + _ => Validity::AllValid, + }; + let values = PrimitiveArray::new(values, values_validity).into_array(); + + // SAFETY: enforced by the DictEncoder. + unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } + }}; +} + +/// Compresses a floating-point array into a dictionary array according to attached stats. 
+pub fn dictionary_encode(stats: &FloatStats) -> DictArray { + let validity = stats.source().validity(); + match stats.erased() { + FloatErasedStats::F16(typed) => typed_encode!(stats, typed, validity, f16), + FloatErasedStats::F32(typed) => typed_encode!(stats, typed, validity, f32), + FloatErasedStats::F64(typed) => typed_encode!(stats, typed, validity, f64), + } +} + +/// Stateless encoder that maps values to dictionary codes via a `HashMap`. +struct DictEncoder; + +/// Trait for encoding values of type `T` into codes of type `I`. +trait Encode { + /// Using the distinct value set, turn the values into a set of codes. + fn encode(distinct: &[T], values: &[T]) -> Buffer; +} + +/// Implements [`Encode`] for a float type using its bit representation as the hash key. +macro_rules! impl_encode { + ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); }; + ($typ:ty, $utyp:ty, $($ityp:ty),+) => { + $( + impl Encode<$typ, $ityp> for DictEncoder { + #[allow(clippy::cast_possible_truncation)] + fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { + let mut codes = + vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity( + distinct.len(), + ); + for (code, &value) in distinct.iter().enumerate() { + codes.insert(value.to_bits(), code as $ityp); + } + + let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); + for value in values { + // Any code lookups which fail are for nulls, so their value does not matter. 
+ output.push(codes.get(&value.to_bits()).copied().unwrap_or_default()); + } + + output.freeze() + } + } + )* + }; +} + +impl_encode!(f16, u16); +impl_encode!(f32, u32); +impl_encode!(f64, u64); + +#[cfg(test)] +mod tests { + use vortex_array::DynArray; + use vortex_array::IntoArray; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + + use super::dictionary_encode; + use crate::stats::FloatStats; + use crate::stats::GenerateStatsOptions; + + #[test] + fn test_float_dict_encode() { + let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32]; + let validity = + Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); + let array = PrimitiveArray::new(values, validity); + + let stats = FloatStats::generate_opts( + &array, + GenerateStatsOptions { + count_distinct_values: true, + }, + ); + let dict_array = dictionary_encode(&stats); + assert_eq!(dict_array.values().len(), 2); + assert_eq!(dict_array.codes().len(), 5); + + let expected = PrimitiveArray::new( + buffer![1f32, 2f32, 2f32, 1f32, 1f32], + Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), + ) + .into_array(); + assert_arrays_eq!(dict_array.as_ref(), expected.as_ref()); + } +} diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs new file mode 100644 index 00000000000..00ec39ae1a9 --- /dev/null +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Dictionary compressor that reuses the unique values in the [`IntegerStats`]. +//! +//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted +//! for external compatibility. 
+ +use vortex_array::IntoArray; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::validity::Validity; +use vortex_array::vtable::ValidityHelper; +use vortex_buffer::Buffer; +use vortex_error::VortexExpect; + +use crate::stats::IntegerErasedStats; +use crate::stats::IntegerStats; + +/// Encodes a typed integer array into a [`DictArray`] using the pre-computed distinct values. +macro_rules! typed_encode { + ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ + let distinct = $typed.distinct().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect(); + + let max_code = values.len(); + let codes = if max_code <= u8::MAX as usize { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + } else if max_code <= u16::MAX as usize { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + } else { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + }; + + let values_validity = match $validity { + Validity::NonNullable => Validity::NonNullable, + _ => Validity::AllValid, + }; + + let values = PrimitiveArray::new(values, values_validity).into_array(); + // SAFETY: invariants enforced in DictEncoder. + unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } + }}; +} + +/// Compresses an integer array into a dictionary array according to attached stats. 
+#[expect( + clippy::cognitive_complexity, + reason = "complexity from match on all integer types" +)] +pub fn dictionary_encode(stats: &IntegerStats) -> DictArray { + let src_validity = stats.source().validity(); + + match stats.erased() { + IntegerErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8), + IntegerErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16), + IntegerErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32), + IntegerErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64), + IntegerErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8), + IntegerErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16), + IntegerErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32), + IntegerErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64), + } +} + +/// Stateless encoder that maps values to dictionary codes via a `HashMap`. +struct DictEncoder; + +/// Trait for encoding values of type `T` into codes of type `I`. +trait Encode { + /// Using the distinct value set, turn the values into a set of codes. + fn encode(distinct: &[T], values: &[T]) -> Buffer; +} + +/// Implements [`Encode`] for an integer type with all code width variants (u8, u16, u32). +macro_rules! impl_encode { + ($typ:ty) => { impl_encode!($typ, u8, u16, u32); }; + ($typ:ty, $($ityp:ty),+) => { + $( + impl Encode<$typ, $ityp> for DictEncoder { + #[allow(clippy::cast_possible_truncation)] + fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { + let mut codes = + vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity( + distinct.len(), + ); + for (code, &value) in distinct.iter().enumerate() { + codes.insert(value, code as $ityp); + } + + let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); + for value in values { + // Any code lookups which fail are for nulls, so their value does not matter. 
+ // SAFETY: we have exactly sized output to be as large as values. + unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) }; + } + + output.freeze() + } + } + )* + }; +} + +impl_encode!(u8); +impl_encode!(u16); +impl_encode!(u32); +impl_encode!(u64); +impl_encode!(i8); +impl_encode!(i16); +impl_encode!(i32); +impl_encode!(i64); + +#[cfg(test)] +mod tests { + use vortex_array::DynArray; + use vortex_array::IntoArray; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + + use super::dictionary_encode; + use crate::stats::IntegerStats; + + #[test] + fn test_dict_encode_integer_stats() { + let data = buffer![100i32, 200, 100, 0, 100]; + let validity = + Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); + let array = PrimitiveArray::new(data, validity); + + let stats = IntegerStats::generate_opts( + &array, + crate::stats::GenerateStatsOptions { + count_distinct_values: true, + }, + ); + let dict_array = dictionary_encode(&stats); + assert_eq!(dict_array.values().len(), 2); + assert_eq!(dict_array.codes().len(), 5); + + let expected = PrimitiveArray::new( + buffer![100i32, 200, 100, 100, 100], + Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), + ) + .into_array(); + assert_arrays_eq!(dict_array.as_ref(), expected.as_ref()); + } +} diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs new file mode 100644 index 00000000000..c8d38dcf56c --- /dev/null +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Dictionary encoding schemes for integer, float, and string arrays. 
+ +pub mod float; +pub mod integer; + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::DictArray; +use vortex_array::builders::dict::dict_encode; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use super::is_integer_primitive; +use super::is_utf8_string; +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::scheme::ChildSelection; +use crate::scheme::DescendantExclusion; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::scheme::estimate_compression_ratio_with_sampling; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// Dictionary encoding for low-cardinality integer values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntDictScheme; + +impl Scheme for IntDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. + fn num_children(&self) -> usize { + 2 + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + if stats.value_count() == 0 { + return Ok(0.0); + } + + let distinct_values_count = stats.distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + // If > 50% of the values are distinct, skip dict. + if distinct_values_count > stats.value_count() / 2 { + return Ok(0.0); + } + + // Ignore nulls encoding for the estimate. We only focus on values. 
+ let values_size = stats.source().ptype().bit_width() * distinct_values_count as usize; + + // Assume codes are compressed RLE + BitPacking. + let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); + + let n_runs = (stats.value_count() / stats.average_run_length()) as usize; + + // Assume that codes will either be BitPack or RLE-BitPack. + let codes_size_bp = (codes_bw * stats.value_count()) as usize; + let codes_size_rle_bp = usize::checked_mul((codes_bw + 32) as usize, n_runs); + + let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); + + let before = stats.value_count() as usize * stats.source().ptype().bit_width(); + + Ok(before as f64 / (values_size + codes_size) as f64) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + let dict = integer::dictionary_encode(stats); + + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. + let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + // SAFETY: compressing codes does not change their values. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} + +/// Dictionary encoding for low-cardinality float values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatDictScheme; + +impl Scheme for FloatDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. 
+ fn num_children(&self) -> usize { + 2 + } + + /// Float dict codes (child 1) are compact unsigned integers that should not be + /// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land, + /// where integer dict encoding is redundant since the values are already deduplicated at + /// the float level. + /// + /// Additional exclusions for codes (IntSequenceScheme, IntRunEndScheme, FoRScheme, + /// ZigZagScheme, SparseScheme, RLE) are expressed as pull rules on those schemes in + /// vortex-btrblocks. + fn descendant_exclusions(&self) -> Vec { + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(0), + }, + ] + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + + if stats.value_count() == 0 { + return Ok(0.0); + } + + if stats + .distinct_count() + .is_some_and(|count| count <= stats.value_count() / 2) + { + return estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx); + } + + Ok(0.0) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + + let dict = float::dictionary_encode(stats); + let has_all_values_referenced = dict.has_all_values_referenced(); + // let DictArrayParts { codes, values, .. } = dict.into_parts(); + + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. + let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + // SAFETY: compressing codes or values does not alter the invariants. 
+ unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(has_all_values_referenced) + .into_array(), + ) + } + } +} + +/// Dictionary encoding for low-cardinality string values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringDictScheme; + +impl Scheme for StringDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. + fn num_children(&self) -> usize { + 2 + } + + /// String dict codes (child 1) are compact unsigned integers that should not be dict-encoded + /// again. + /// + /// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme, + /// RunEndScheme, RLE, etc.) are expressed as pull rules on those schemes in `vortex-btrblocks`. + fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }] + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + if stats + .estimated_distinct_count() + .is_none_or(|c| c > stats.value_count() / 2) + { + return Ok(0.0); + } + + if stats.value_count() == 0 { + return Ok(0.0); + } + + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + let dict = dict_encode(&stats.source().clone().into_array())?; + + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. 
+ let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + // SAFETY: compressing codes or values does not alter the invariants. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} diff --git a/vortex-compressor/src/builtins/mod.rs b/vortex-compressor/src/builtins/mod.rs new file mode 100644 index 00000000000..704453fb40b --- /dev/null +++ b/vortex-compressor/src/builtins/mod.rs @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Built-in compression schemes that use only `vortex-array` encodings. +//! +//! These schemes produce arrays using types already in `vortex-array` ([`ConstantArray`], +//! [`DictArray`], [`MaskedArray`], etc.) and have no external encoding crate dependencies. +//! +//! [`ConstantArray`]: vortex_array::arrays::ConstantArray +//! [`DictArray`]: vortex_array::arrays::DictArray +//! [`MaskedArray`]: vortex_array::arrays::MaskedArray + +pub use constant::FloatConstantScheme; +pub use constant::IntConstantScheme; +pub use constant::StringConstantScheme; +pub use dict::FloatDictScheme; +pub use dict::IntDictScheme; +pub use dict::StringDictScheme; +pub use dict::float::dictionary_encode as float_dictionary_encode; +pub use dict::integer::dictionary_encode as integer_dictionary_encode; + +mod constant; +mod dict; + +use vortex_array::Canonical; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; + +/// Returns `true` if the canonical array is a primitive with an integer ptype. +pub fn is_integer_primitive(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Primitive(p) if p.ptype().is_int()) +} + +/// Returns `true` if the canonical form represents a floating-point primitive. 
+pub fn is_float_primitive(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Primitive(p) if !p.ptype().is_int()) +} + +/// Returns `true` if the canonical array is a UTF-8 string type. +pub fn is_utf8_string(canonical: &Canonical) -> bool { + matches!(canonical, + Canonical::VarBinView(v) if + v.dtype().eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) + ) +} diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs new file mode 100644 index 00000000000..aad6cc7e33e --- /dev/null +++ b/vortex-compressor/src/compressor.rs @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Cascading array compression implementation. + +use std::sync::Arc; + +use parking_lot::Mutex; +use parking_lot::MutexGuard; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::CanonicalValidity; +use vortex_array::DynArray; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::ToCanonical; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::FixedSizeListArray; +use vortex_array::arrays::ListArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::listview::list_from_list_view; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::ValidityHelper; +use vortex_error::VortexResult; + +use crate::builtins::IntDictScheme; +use crate::ctx::CompressorContext; +use crate::scheme::ChildSelection; +use crate::scheme::DescendantExclusion; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::scheme::SchemeId; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// The implicit root scheme ID for the compressor's 
own cascading (e.g. list offset compression). +/// +/// This is the **only** [`SchemeId`] that is not auto-provided via [`SchemeExt`]. +const ROOT_SCHEME_ID: SchemeId = SchemeId { + name: "vortex.compressor.root", +}; + +/// Child indices for the compressor's list/listview compression. +mod root_list_children { + /// List/ListView offsets child. + pub const OFFSETS: usize = 1; + /// ListView sizes child. + pub const SIZES: usize = 2; +} + +/// The main compressor type implementing cascading adaptive compression. +/// +/// This compressor applies adaptive compression [`Scheme`]s to arrays based on their data types and +/// characteristics. It recursively compresses nested structures like structs and lists, and chooses +/// optimal compression schemes for leaf types. +/// +/// The compressor works by: +/// 1. Canonicalizing input arrays to a standard representation. +/// 2. Pre-filtering schemes by [`Scheme::matches`] and exclusion rules. +/// 3. Evaluating each matching scheme's compression ratio on a sample. +/// 4. Compressing with the best scheme and verifying the result is smaller. +/// +/// No scheme may appear twice in a cascade chain. The compressor enforces this automatically +/// along with push/pull exclusion rules declared by each scheme. +#[derive(Debug, Clone)] +pub struct CascadingCompressor { + /// The enabled compression schemes. + schemes: Vec<&'static dyn Scheme>, + + /// Descendant exclusion rules for the compressor's own cascading (e.g. excluding Dict from + /// list offsets). + root_exclusions: Vec, + + /// Shared execution context for array operations during compression. + /// + /// This should have low contention as we only execute arrays one at a time during compression. + ctx: Arc>, +} + +impl CascadingCompressor { + /// Creates a new compressor with the given schemes. + /// + /// Root-level exclusion rules (e.g. excluding Dict from list offsets) are built + /// automatically. 
+ pub fn new(schemes: Vec<&'static dyn Scheme>) -> Self { + // Root exclusion: exclude IntDict from list/listview offsets (monotonically + // increasing data where dictionary encoding is wasteful). + let root_exclusions = vec![DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(root_list_children::OFFSETS), + }]; + Self { + schemes, + root_exclusions, + // TODO(connor): The caller should probably pass this in. + ctx: Arc::new(Mutex::new(LEGACY_SESSION.create_execution_ctx())), + } + } + + /// Returns a mutable borrow of the execution context. + pub fn execution_ctx(&self) -> MutexGuard<'_, ExecutionCtx> { + self.ctx.lock() + } + + /// Compresses a child array produced by a cascading scheme. + /// + /// If the cascade budget is exhausted, the canonical array is returned as-is. Otherwise, + /// the child context is created by descending and recording the parent scheme + child + /// index, and compression proceeds normally. + /// + /// # Errors + /// + /// Returns an error if compression fails. + pub fn compress_child( + &self, + child: &ArrayRef, + parent_ctx: &CompressorContext, + parent_id: SchemeId, + child_index: usize, + ) -> VortexResult { + if parent_ctx.finished_cascading() { + return Ok(child.clone()); + } + + let canonical = child + .clone() + .execute::(&mut self.execution_ctx())? + .0; + let compact = canonical.compact()?; + + let child_ctx = parent_ctx + .clone() + .descend_with_scheme(parent_id, child_index); + self.compress_canonical(compact, child_ctx) + } + + /// Compresses an array using cascading adaptive compression. + /// + /// First canonicalizes and compacts the array, then applies optimal compression schemes. + /// + /// # Errors + /// + /// Returns an error if canonicalization or compression fails. + pub fn compress(&self, array: &ArrayRef) -> VortexResult { + let canonical = array + .clone() + .execute::(&mut self.execution_ctx())? 
+ .0; + + // Compact it, removing any wasted space before we attempt to compress it. + let compact = canonical.compact()?; + + self.compress_canonical(compact, CompressorContext::new()) + } + + /// Compresses a canonical array by dispatching to type-specific logic. + /// + /// # Errors + /// + /// Returns an error if compression of any sub-array fails. + fn compress_canonical( + &self, + array: Canonical, + ctx: CompressorContext, + ) -> VortexResult { + match array { + Canonical::Null(null_array) => Ok(null_array.into_array()), + Canonical::Bool(bool_array) => Ok(bool_array.into_array()), + Canonical::Primitive(primitive) => { + self.choose_and_compress(Canonical::Primitive(primitive), ctx) + } + Canonical::Decimal(decimal) => { + self.choose_and_compress(Canonical::Decimal(decimal), ctx) + } + Canonical::Struct(struct_array) => { + let fields = struct_array + .unmasked_fields() + .iter() + .map(|field| self.compress(field)) + .collect::, _>>()?; + + Ok(StructArray::try_new( + struct_array.names().clone(), + fields, + struct_array.len(), + struct_array.validity().clone(), + )? + .into_array()) + } + Canonical::List(list_view_array) => { + if list_view_array.is_zero_copy_to_list() || list_view_array.elements().is_empty() { + let list_array = list_from_list_view(list_view_array)?; + self.compress_list_array(list_array, ctx) + } else { + self.compress_list_view_array(list_view_array, ctx) + } + } + Canonical::FixedSizeList(fsl_array) => { + let compressed_elems = self.compress(fsl_array.elements())?; + + Ok(FixedSizeListArray::try_new( + compressed_elems, + fsl_array.list_size(), + fsl_array.validity().clone(), + fsl_array.len(), + )? + .into_array()) + } + Canonical::VarBinView(strings) => { + if strings + .dtype() + .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) + { + self.choose_and_compress(Canonical::VarBinView(strings), ctx) + } else { + // We do not compress binary arrays. 
+ Ok(strings.into_array()) + } + } + Canonical::Extension(ext_array) => { + let before_nbytes = ext_array.as_ref().nbytes(); + + // Try scheme-based compression first. + let result = + self.choose_and_compress(Canonical::Extension(ext_array.clone()), ctx)?; + if result.nbytes() < before_nbytes { + return Ok(result); + } + + // Otherwise, fall back to compressing the underlying storage array. + let compressed_storage = self.compress(ext_array.storage_array())?; + + Ok( + ExtensionArray::new(ext_array.ext_dtype().clone(), compressed_storage) + .into_array(), + ) + } + } + } + + /// The main scheme-selection entry point for a single leaf array. + /// + /// Filters allowed schemes by [`matches`] and exclusion rules, merges their [`stats_options`] + /// into a single [`GenerateStatsOptions`], then delegates to [`choose_scheme`] to pick the + /// winner by estimated compression ratio. + /// + /// If a winner is found and its compressed output is actually smaller, that output is returned. + /// Otherwise, the original array is returned unchanged. + /// + /// Empty and all-null arrays are short-circuited before any scheme evaluation. + /// + /// [`matches`]: Scheme::matches + /// [`stats_options`]: Scheme::stats_options + /// [`choose_scheme`]: Self::choose_scheme + fn choose_and_compress( + &self, + canonical: Canonical, + ctx: CompressorContext, + ) -> VortexResult { + let eligible_schemes: Vec<&'static dyn Scheme> = self + .schemes + .iter() + .copied() + .filter(|s| s.matches(&canonical) && !self.is_excluded(*s, &ctx)) + .collect(); + + let array: ArrayRef = canonical.into(); + + // If there are no schemes that we can compress into, then just return it uncompressed. + if eligible_schemes.is_empty() { + return Ok(array); + } + + // Nothing to compress if empty or all-null. + if array.is_empty() { + return Ok(array); + } + + if array.all_invalid()? 
{ + return Ok( + ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(), + ); + } + + let before_nbytes = array.nbytes(); + let merged_opts = eligible_schemes + .iter() + .fold(GenerateStatsOptions::default(), |acc, s| { + acc.merge(s.stats_options()) + }); + + let ctx = ctx.with_stats_options(merged_opts); + + let mut data = ArrayAndStats::new(array, merged_opts); + + if let Some(winner) = self.choose_scheme(&eligible_schemes, &mut data, ctx.clone())? { + let compressed = winner.compress(self, &mut data, ctx)?; + if compressed.nbytes() < before_nbytes { + return Ok(compressed); + } + } + + // No scheme improved on the original. + Ok(data.into_array()) + } + + /// Calls [`expected_compression_ratio`] on each candidate and returns the scheme with the + /// highest ratio, or `None` if no scheme exceeds 1.0. Ties are broken by registration order + /// (earlier in the list wins). + /// + /// [`expected_compression_ratio`]: Scheme::expected_compression_ratio + fn choose_scheme( + &self, + schemes: &[&'static dyn Scheme], + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult> { + let mut best: Option<(&'static dyn Scheme, f64)> = None; + + for &scheme in schemes { + // Constant detection on a sample is a false positive: the sample being constant + // does not mean the full array is constant. + if ctx.is_sample() && scheme.detects_constant() { + continue; + } + + let ratio = scheme.expected_compression_ratio(self, data, ctx.clone())?; + + tracing::debug!(scheme = %scheme.id(), ratio, "evaluated compression ratio"); + + if is_better_ratio(ratio, &best) { + best = Some((scheme, ratio)); + + // Schemes that return f64::MAX (like Constant) cannot be beat, so stop early. + if ratio == f64::MAX { + break; + } + } + } + + Ok(best.map(|(s, _)| s)) + } + + // TODO(connor): Lots of room for optimization here. + /// Returns `true` if the candidate scheme should be excluded based on the cascade history and + /// exclusion rules. 
+ fn is_excluded(&self, candidate: &dyn Scheme, ctx: &CompressorContext) -> bool { + let id = candidate.id(); + let history = ctx.cascade_history(); + + // Self-exclusion: no scheme appears twice in any chain. + if history.iter().any(|&(sid, _)| sid == id) { + return true; + } + + let mut iter = history.iter().copied().peekable(); + + // The root entry is always first in the history (if present). Check if the root has + // excluded us. + if let Some((_, child_idx)) = iter.next_if(|&(sid, _)| sid == ROOT_SCHEME_ID) + && self + .root_exclusions + .iter() + .any(|rule| rule.excluded == id && rule.children.contains(child_idx)) + { + return true; + } + + // Push rules: Check if any of our ancestors have excluded us. + for (ancestor_id, child_idx) in iter { + if let Some(ancestor) = self.schemes.iter().find(|s| s.id() == ancestor_id) + && ancestor + .descendant_exclusions() + .iter() + .any(|rule| rule.excluded == id && rule.children.contains(child_idx)) + { + return true; + } + } + + // Pull rules: Check if we have excluded ourselves because of our ancestors. + for rule in candidate.ancestor_exclusions() { + if history + .iter() + .any(|(sid, cidx)| *sid == rule.ancestor && rule.children.contains(*cidx)) + { + return true; + } + } + + false + } + + /// Compresses a [`ListArray`] by narrowing offsets and recursively compressing elements. + fn compress_list_array( + &self, + list_array: ListArray, + ctx: CompressorContext, + ) -> VortexResult { + let list_array = list_array.reset_offsets(true)?; + + let compressed_elems = self.compress(list_array.elements())?; + + // Record the root scheme with the offsets child index so root exclusion rules apply. 
+ let offset_ctx = ctx.descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), + offset_ctx, + )?; + + Ok(ListArray::try_new( + compressed_elems, + compressed_offsets, + list_array.validity().clone(), + )? + .into_array()) + } + + /// Compresses a [`ListViewArray`] by narrowing offsets/sizes and recursively compressing + /// elements. + fn compress_list_view_array( + &self, + list_view: ListViewArray, + ctx: CompressorContext, + ) -> VortexResult { + let compressed_elems = self.compress(list_view.elements())?; + + let offset_ctx = ctx + .clone() + .descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), + offset_ctx, + )?; + + let sizes_ctx = ctx.descend_with_scheme(ROOT_SCHEME_ID, root_list_children::SIZES); + let compressed_sizes = self.compress_canonical( + Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), + sizes_ctx, + )?; + + Ok(ListViewArray::try_new( + compressed_elems, + compressed_offsets, + compressed_sizes, + list_view.validity().clone(), + )? + .into_array()) + } +} + +/// Returns `true` if `ratio` is a valid compression ratio (> 1.0, finite, not subnormal) that +/// beats the current best. 
fn is_better_ratio(ratio: f64, best: &Option<(&'static dyn Scheme, f64)>) -> bool {
    // Subnormal ratios are vanishingly close to zero and indicate a degenerate measurement,
    // so they are rejected along with NaN and infinite values.
    ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::builtins::FloatDictScheme;
    use crate::builtins::IntDictScheme;
    use crate::builtins::StringDictScheme;
    use crate::ctx::CompressorContext;
    use crate::scheme::SchemeExt;

    /// Builds a compressor with the three built-in dictionary schemes registered.
    fn compressor() -> CascadingCompressor {
        CascadingCompressor::new(vec![&IntDictScheme, &FloatDictScheme, &StringDictScheme])
    }

    #[test]
    fn test_self_exclusion() {
        let c = compressor();
        let ctx = CompressorContext::default().descend_with_scheme(IntDictScheme.id(), 0);

        // IntDictScheme is in the history, so it should be excluded.
        assert!(c.is_excluded(&IntDictScheme, &ctx));
    }

    #[test]
    fn test_root_exclusion_list_offsets() {
        let c = compressor();
        let ctx = CompressorContext::default()
            .descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS);

        // IntDict should be excluded for list offsets.
        assert!(c.is_excluded(&IntDictScheme, &ctx));
    }

    #[test]
    fn test_push_rule_float_dict_excludes_int_dict_from_codes() {
        let c = compressor();
        // FloatDict cascading through codes (child 1).
        let ctx = CompressorContext::default().descend_with_scheme(FloatDictScheme.id(), 1);

        // IntDict should be excluded from FloatDict's codes child.
        assert!(c.is_excluded(&IntDictScheme, &ctx));
    }

    #[test]
    fn test_push_rule_float_dict_excludes_int_dict_from_values() {
        let c = compressor();
        // FloatDict cascading through values (child 0).
        let ctx = CompressorContext::default().descend_with_scheme(FloatDictScheme.id(), 0);

        // IntDict should also be excluded from FloatDict's values child (ALP propagation
        // replacement).
        assert!(c.is_excluded(&IntDictScheme, &ctx));
    }

    #[test]
    fn test_no_exclusion_without_history() {
        let c = compressor();
        let ctx = CompressorContext::default();

        // No history means no exclusions.
        assert!(!c.is_excluded(&IntDictScheme, &ctx));
    }
}
diff --git a/vortex-compressor/src/ctx.rs b/vortex-compressor/src/ctx.rs
new file mode 100644
index 00000000000..465a7398350
--- /dev/null
+++ b/vortex-compressor/src/ctx.rs
@@ -0,0 +1,109 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Compression context for recursive compression.

use vortex_error::VortexExpect;

use crate::scheme::SchemeId;
use crate::stats::GenerateStatsOptions;

// TODO(connor): Why is this 3??? This doesn't seem smart or adaptive.
/// Maximum cascade depth for compression.
pub const MAX_CASCADE: usize = 3;

/// Context passed through recursive compression calls.
///
/// Tracks the cascade history (which schemes and child indices have been applied in the current
/// chain) so the compressor can enforce exclusion rules and prevent cycles.
#[derive(Debug, Clone)]
pub struct CompressorContext {
    /// Whether we're compressing a sample (for ratio estimation).
    is_sample: bool,
    /// Remaining cascade depth allowed.
    allowed_cascading: usize,
    /// Merged stats options from all eligible schemes at this compression site.
    stats_options: GenerateStatsOptions,
    /// The cascade chain: `(scheme_id, child_index)` pairs from root to current depth.
    /// Used for self-exclusion, push rules ([`descendant_exclusions`]), and pull rules
    /// ([`ancestor_exclusions`]).
    ///
    /// [`descendant_exclusions`]: crate::scheme::Scheme::descendant_exclusions
    /// [`ancestor_exclusions`]: crate::scheme::Scheme::ancestor_exclusions
    cascade_history: Vec<(SchemeId, usize)>,
}

impl CompressorContext {
    /// Creates a new `CompressorContext`.
    ///
    /// This should **only** be created by the compressor.
    pub(super) fn new() -> Self {
        Self {
            is_sample: false,
            allowed_cascading: MAX_CASCADE,
            stats_options: GenerateStatsOptions::default(),
            cascade_history: Vec::new(),
        }
    }
}

// `Default` is test-only so that production code must obtain a context from the compressor.
#[cfg(test)]
impl Default for CompressorContext {
    fn default() -> Self {
        Self::new()
    }
}

impl CompressorContext {
    /// Whether this context is for sample compression (ratio estimation).
    pub fn is_sample(&self) -> bool {
        self.is_sample
    }

    /// Whether cascading is exhausted (no further cascade levels allowed).
    pub fn finished_cascading(&self) -> bool {
        self.allowed_cascading == 0
    }

    /// Returns the merged stats generation options for this compression site.
    pub fn stats_options(&self) -> GenerateStatsOptions {
        self.stats_options
    }

    /// Returns a context with the given stats options.
    pub fn with_stats_options(mut self, opts: GenerateStatsOptions) -> Self {
        self.stats_options = opts;
        self
    }

    /// Returns a context marked as sample compression.
    pub fn as_sample(mut self) -> Self {
        self.is_sample = true;
        self
    }

    /// Returns a context that disallows further cascading.
    pub fn as_leaf(mut self) -> Self {
        self.allowed_cascading = 0;
        self
    }

    /// Descends one level in the cascade, recording the current scheme and which child is
    /// being compressed.
    ///
    /// The `child_index` identifies which child of the scheme is being compressed (e.g. for
    /// Dict: values=0, codes=1).
    ///
    /// # Panics
    ///
    /// Panics if the cascade depth is already exhausted; callers must check
    /// [`finished_cascading`](Self::finished_cascading) before descending.
    pub(crate) fn descend_with_scheme(mut self, id: SchemeId, child_index: usize) -> Self {
        self.allowed_cascading = self
            .allowed_cascading
            .checked_sub(1)
            .vortex_expect("cannot descend: cascade depth exhausted");
        self.cascade_history.push((id, child_index));
        self
    }

    /// Returns the cascade chain of `(scheme_id, child_index)` pairs.
    pub fn cascade_history(&self) -> &[(SchemeId, usize)] {
        &self.cascade_history
    }
}
diff --git a/vortex-compressor/src/lib.rs b/vortex-compressor/src/lib.rs
new file mode 100644
index 00000000000..683bea4f8aa
--- /dev/null
+++ b/vortex-compressor/src/lib.rs
@@ -0,0 +1,27 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![deny(missing_docs)]
#![warn(clippy::missing_docs_in_private_items)]
#![warn(clippy::missing_errors_doc)]
#![warn(clippy::missing_panics_doc)]
#![warn(clippy::missing_safety_doc)]

//! Encoding-agnostic compression framework for Vortex arrays.
//!
//! This crate provides the core compression engine: the [`Scheme`](scheme::Scheme) trait,
//! sampling-based ratio estimation, cascaded compression, and statistics infrastructure for
//! deciding the best encoding scheme for an array.
//!
//! This crate contains no encoding dependencies. Batteries-included compressors are provided by
//! downstream crates like `vortex-btrblocks`, which register different encodings with the
//! compressor.

pub mod builtins;
pub mod ctx;
pub mod scheme;
pub mod stats;

mod sample;

mod compressor;
pub use compressor::CascadingCompressor;
diff --git a/vortex-compressor/src/sample.rs b/vortex-compressor/src/sample.rs
new file mode 100644
index 00000000000..fe6cd5078a9
--- /dev/null
+++ b/vortex-compressor/src/sample.rs
@@ -0,0 +1,161 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Sampling utilities for compression ratio estimation.

use rand::RngExt;
use rand::SeedableRng;
use rand::prelude::StdRng;
use vortex_array::ArrayRef;
use vortex_array::DynArray;
use vortex_array::IntoArray;
use vortex_array::arrays::ChunkedArray;
use vortex_error::VortexExpect;

/// The size of each sampled run.
pub const SAMPLE_SIZE: u32 = 64;

/// The number of sampled runs.
///
/// # Warning
///
/// The product of `SAMPLE_SIZE` and `SAMPLE_COUNT` should be (roughly) a multiple of 1024 so that
/// fastlanes bitpacking of sampled vectors does not introduce (large amounts of) padding.
pub const SAMPLE_COUNT: u32 = 16;

/// Fixed seed for the sampling RNG, ensuring deterministic compression output.
const SAMPLE_SEED: u64 = 1234567890;

/// Samples approximately 1% of the input array for compression ratio estimation.
///
/// More precisely: draws `sample_count` runs of `sample_size` contiguous elements (the ~1%
/// figure holds when `sample_count` comes from [`sample_count_approx_one_percent`]). Inputs no
/// longer than the requested sample are returned unchanged.
pub(crate) fn sample(input: &ArrayRef, sample_size: u32, sample_count: u32) -> ArrayRef {
    if input.len() <= (sample_size as usize) * (sample_count as usize) {
        return input.to_array();
    }

    let slices = stratified_slices(
        input.len(),
        sample_size,
        sample_count,
        &mut StdRng::seed_from_u64(SAMPLE_SEED),
    );

    // For every sampled range, slice out the corresponding run and stitch the runs together
    // into a single ChunkedArray.
    let chunks: Vec<_> = slices
        .into_iter()
        .map(|(start, end)| {
            input
                .slice(start..end)
                .vortex_expect("slice should succeed")
        })
        .collect();
    // SAFETY: all chunks are slices of `input`, so they share its dtype.
    unsafe { ChunkedArray::new_unchecked(chunks, input.dtype().clone()) }.into_array()
}

/// Computes the number of sample chunks to cover approximately 1% of `len` elements,
/// with a minimum of `SAMPLE_SIZE * SAMPLE_COUNT` (1024) values.
pub(crate) fn sample_count_approx_one_percent(len: usize) -> u32 {
    let approximately_one_percent =
        (len / 100) / usize::try_from(SAMPLE_SIZE).vortex_expect("SAMPLE_SIZE must fit in usize");
    u32::max(
        u32::next_multiple_of(
            approximately_one_percent
                .try_into()
                .vortex_expect("sample count must fit in u32"),
            16,
        ),
        SAMPLE_COUNT,
    )
}

/// Divides an array into `sample_count` equal partitions and picks one random contiguous
/// slice of `sample_size` elements from each partition.
+/// +/// This is a stratified sampling strategy: instead of drawing all samples from one region, +/// it spreads them evenly across the array so that every part of the data is represented. +/// Each returned `(start, end)` pair is a half-open range into the original array. +/// +/// If the total number of requested samples (`sample_size * sample_count`) is greater than or +/// equal to `length`, a single slice spanning the whole array is returned. +fn stratified_slices( + length: usize, + sample_size: u32, + sample_count: u32, + rng: &mut StdRng, +) -> Vec<(usize, usize)> { + let total_num_samples: usize = (sample_count as usize) * (sample_size as usize); + if total_num_samples >= length { + return vec![(0usize, length)]; + } + + let partitions = partition_indices(length, sample_count); + let num_samples_per_partition: Vec = partition_indices(total_num_samples, sample_count) + .into_iter() + .map(|(start, stop)| stop - start) + .collect(); + + partitions + .into_iter() + .zip(num_samples_per_partition) + .map(|((start, stop), size)| { + assert!( + stop - start >= size, + "Slices must be bigger than their sampled size" + ); + let random_start = rng.random_range(start..=(stop - size)); + (random_start, random_start + size) + }) + .collect() +} + +/// Splits `[0, length)` into `num_partitions` contiguous, non-overlapping slices of +/// approximately equal size. +/// +/// If `length` is not evenly divisible by `num_partitions`, the first +/// `length % num_partitions` slices get one extra element. Each returned `(start, end)` pair +/// is a half-open range. 
+fn partition_indices(length: usize, num_partitions: u32) -> Vec<(usize, usize)> { + let num_long_parts = length % num_partitions as usize; + let short_step = length / num_partitions as usize; + let long_step = short_step + 1; + let long_stop = num_long_parts * long_step; + + (0..long_stop) + .step_by(long_step) + .map(|off| (off, off + long_step)) + .chain( + (long_stop..length) + .step_by(short_step) + .map(|off| (off, off + short_step)), + ) + .collect() +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_error::VortexResult; + + use super::*; + + #[test] + fn sample_is_deterministic() -> VortexResult<()> { + // Create a deterministic array with linear-with-noise pattern + let values: Vec = (0i64..100_000).map(|i| i + (i * 7 + 3) % 11).collect(); + + let array = + PrimitiveArray::new(Buffer::from_iter(values), Validity::NonNullable).into_array(); + + let first = sample(&array, SAMPLE_SIZE, SAMPLE_COUNT); + for _ in 0..10 { + let again = sample(&array, SAMPLE_SIZE, SAMPLE_COUNT); + assert_eq!(first.nbytes(), again.nbytes()); + assert_arrays_eq!(&first, &again); + } + Ok(()) + } +} diff --git a/vortex-compressor/src/scheme.rs b/vortex-compressor/src/scheme.rs new file mode 100644 index 00000000000..dab34a778eb --- /dev/null +++ b/vortex-compressor/src/scheme.rs @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Unified compression scheme trait and exclusion rules. 

use std::fmt;
use std::fmt::Debug;
use std::hash::Hash;
use std::hash::Hasher;

use vortex_array::ArrayRef;
use vortex_array::Canonical;
use vortex_error::VortexResult;

use crate::CascadingCompressor;
use crate::ctx::CompressorContext;
use crate::sample::SAMPLE_SIZE;
use crate::sample::sample;
use crate::sample::sample_count_approx_one_percent;
use crate::stats::ArrayAndStats;
use crate::stats::GenerateStatsOptions;

/// Unique identifier for a compression scheme.
///
/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is
/// auto-implemented for all [`Scheme`] types. There is no public constructor.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SchemeId {
    /// Only constructable within `vortex-compressor`.
    ///
    /// The only public way to obtain a [`SchemeId`] is through [`SchemeExt::id()`].
    pub(super) name: &'static str,
}

// Displays the scheme's globally unique name verbatim.
impl fmt::Display for SchemeId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.name)
    }
}

/// Selects which children of a cascading scheme a rule applies to.
#[derive(Debug, Clone, Copy)]
pub enum ChildSelection {
    /// Rule applies to all children.
    All,
    /// Rule applies to a single child.
    One(usize),
    /// Rule applies to multiple specific children.
    Many(&'static [usize]),
}

impl ChildSelection {
    /// Returns `true` if this selection includes the given child index.
    pub fn contains(&self, child_index: usize) -> bool {
        match self {
            ChildSelection::All => true,
            ChildSelection::One(idx) => *idx == child_index,
            ChildSelection::Many(indices) => indices.contains(&child_index),
        }
    }
}

/// Push rule: declared by a cascading scheme to exclude another scheme from the subtree
/// rooted at the specified children.
///
/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. For example,
/// `ZigZag` excludes `Dict` from all its children.
+#[derive(Debug, Clone, Copy)] +pub struct DescendantExclusion { + /// The scheme to exclude from descendants. + pub excluded: SchemeId, + /// Which children of the declaring scheme this rule applies to. + pub children: ChildSelection, +} + +/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in the +/// cascade chain. +/// +/// Use this when the excluded scheme (the descendant) knows about the ancestor. For example, +/// `Sequence` excludes itself when `IntDict` is an ancestor on its codes child. +#[derive(Debug, Clone, Copy)] +pub struct AncestorExclusion { + /// The ancestor scheme that makes the declaring scheme ineligible. + pub ancestor: SchemeId, + /// Which children of the ancestor this rule applies to. + pub children: ChildSelection, +} + +/// A single compression encoding that the [`CascadingCompressor`] can select from. +/// +/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a +/// given array, picks the one with the highest [`expected_compression_ratio`], and calls +/// [`compress`] on the winner. +/// +/// One of the key features of this compressor is that schemes may "cascade": a scheme's +/// [`compress`] can call back into the compressor via [`CascadingCompressor::compress_child`] to +/// compress child or transformed arrays, building up multiple encoding layers (e.g. +/// frame-of-reference and then bit-packing). +/// +/// # Identity +/// +/// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`] +/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] used +/// for equality, hashing, and exclusion rules. +/// +/// # Cascading and children +/// +/// Schemes that produce child arrays for further compression declare [`num_children`] > 0. Each +/// child is identified by index. 
Cascading schemes should use +/// [`CascadingCompressor::compress_child`] to compress each child array, which handles cascade +/// level / budget tracking and context management automatically. +/// +/// No scheme may appear twice in a cascade chain (enforced by the compressor). This keeps the +/// search space a tree. +/// +/// # Exclusion rules +/// +/// Schemes declare exclusion rules to prevent incompatible scheme combinations in the cascade +/// chain: +/// +/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when the +/// declaring scheme knows about the excluded scheme. +/// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." Used when +/// the declaring scheme knows about the ancestor. +/// +/// # Implementing a scheme +/// +/// At a minimum, implementors must provide [`scheme_name`], [`matches`], and [`compress`]. +/// +/// The default [`expected_compression_ratio`] estimates the ratio by compressing a small sample. +/// Implementors should only override this method when a cheaper heuristic is available (e.g. +/// returning `f64::MAX` for constant detection or `0.0` for early rejection based on stats). +/// +/// Schemes that need statistics that may be expensive to compute should override [`stats_options`] +/// to declare what they require. The compressor merges all eligible schemes' options before +/// generating stats, so each stat is always computed at most once for a given array. +/// +/// [`scheme_name`]: Scheme::scheme_name +/// [`matches`]: Scheme::matches +/// [`compress`]: Scheme::compress +/// [`expected_compression_ratio`]: Scheme::expected_compression_ratio +/// [`stats_options`]: Scheme::stats_options +/// [`num_children`]: Scheme::num_children +/// [`descendant_exclusions`]: Scheme::descendant_exclusions +/// [`ancestor_exclusions`]: Scheme::ancestor_exclusions +pub trait Scheme: Debug + Send + Sync { + /// The globally unique name for this scheme (e.g. 
`"vortex.int.bitpacking"`). + fn scheme_name(&self) -> &'static str; + + /// Whether this scheme can compress the given canonical array. + fn matches(&self, canonical: &Canonical) -> bool; + + /// True if this scheme detects constant arrays. + fn detects_constant(&self) -> bool { + false + } + + /// Returns the stats generation options this scheme requires. The compressor merges all + /// eligible schemes' options before generating stats so that a single stats pass satisfies + /// every scheme. + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions::default() + } + + /// The number of child arrays this scheme produces when cascading. Returns 0 for leaf + /// schemes that produce a final encoded array. + fn num_children(&self) -> usize { + 0 + } + + /// Schemes to exclude from specific children's subtrees (push direction). + /// + /// Each rule says: "when I cascade through child Y, do not use scheme X anywhere in that + /// subtree." Only meaningful when [`num_children`](Scheme::num_children) > 0. + fn descendant_exclusions(&self) -> Vec { + Vec::new() + } + + /// Ancestors that make this scheme ineligible (pull direction). + /// + /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, do + /// not try me." + fn ancestor_exclusions(&self) -> Vec { + Vec::new() + } + + // TODO(connor): It would be nice if we returned a more useful type that said "choose me no + // matter what" instead of `f64::MAX`. + /// Estimate the compression ratio for this scheme on the given array. + /// + /// # Errors + /// + /// Returns an error if compression of the sample fails. + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + /// Compress the array using this scheme. + /// + /// # Errors + /// + /// Returns an error if compression fails. 
+ fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult; +} + +impl PartialEq for dyn Scheme { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl Eq for dyn Scheme {} + +impl Hash for dyn Scheme { + fn hash(&self, state: &mut H) { + self.id().hash(state); + } +} + +/// Extension trait providing [`id`](SchemeExt::id) for all [`Scheme`] implementors. +/// +/// This trait is automatically implemented for every type that implements [`Scheme`]. Because the +/// blanket implementation covers all types, external crates cannot override `id()`. +pub trait SchemeExt: Scheme { + /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name). + fn id(&self) -> SchemeId { + SchemeId { + name: self.scheme_name(), + } + } +} + +impl SchemeExt for T {} + +/// Estimates compression ratio by compressing a ~1% sample of the data. +/// +/// Creates a new [`ArrayAndStats`] for the sample so that stats are generated from the sample, not +/// the full array. +/// +/// # Errors +/// +/// Returns an error if sample compression fails. +pub fn estimate_compression_ratio_with_sampling( + scheme: &S, + compressor: &CascadingCompressor, + array: &ArrayRef, + ctx: CompressorContext, +) -> VortexResult { + let sample_array = if ctx.is_sample() { + array.clone() + } else { + let source_len = array.len(); + let sample_count = sample_count_approx_one_percent(source_len); + + tracing::trace!( + "Sampling {} values out of {}", + SAMPLE_SIZE as u64 * sample_count as u64, + source_len + ); + + sample(array, SAMPLE_SIZE, sample_count) + }; + + let mut sample_data = ArrayAndStats::new(sample_array, ctx.stats_options()); + let sample_ctx = ctx.as_sample(); + + let after = scheme + .compress(compressor, &mut sample_data, sample_ctx)? 
+ .nbytes(); + let before = sample_data.array().nbytes(); + let ratio = before as f64 / after as f64; + + tracing::debug!("estimate_compression_ratio_with_sampling(compressor={scheme:#?}) = {ratio}",); + + Ok(ratio) +} diff --git a/vortex-compressor/src/stats/cache.rs b/vortex-compressor/src/stats/cache.rs new file mode 100644 index 00000000000..bbb6522337f --- /dev/null +++ b/vortex-compressor/src/stats/cache.rs @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Per-compression-site statistics cache and the [`ArrayAndStats`] bundle. + +use std::any::Any; +use std::any::TypeId; + +use vortex_array::ArrayRef; +use vortex_array::ToCanonical; +use vortex_error::VortexExpect; + +use super::FloatStats; +use super::GenerateStatsOptions; +use super::IntegerStats; +use super::StringStats; + +/// Cache for compression statistics, keyed by concrete type. +struct StatsCache { + // TODO(connor): We could further optimize this with a `SmallVec` here. + /// The cache entries, keyed by [`TypeId`]. + /// + /// The total number of statistics types in this stats should be relatively small, so we use a + /// vector instead of a hash map. + entries: Vec<(TypeId, Box)>, +} + +impl StatsCache { + /// Creates a new empty cache. + fn new() -> Self { + Self { + entries: Vec::new(), + } + } + + /// Returns a cached value, computing it on first access. + fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { + let type_id = TypeId::of::(); + let pos = self.entries.iter().position(|(id, _)| *id == type_id); + + if let Some(pos) = pos { + self.entries[pos] + .1 + .downcast_ref::() + .vortex_expect("we just checked the TypeID") + } else { + self.entries.push((type_id, Box::new(f()))); + self.entries + .last() + .vortex_expect("just pushed") + .1 + .downcast_ref::() + .vortex_expect("we just checked the TypeID") + } + } +} + +/// An array bundled with its lazily-computed statistics cache. 
+/// +/// The cache is guaranteed to correspond to the array. When a scheme creates a derived array (e.g. +/// FoR bias subtraction), it must create a new [`ArrayAndStats`] so that stale stats from the +/// original array are not reused. +/// +/// Built-in stats are accessed via typed methods (`integer_stats`, `float_stats`, `string_stats`) +/// which generate stats lazily on first access using the stored [`GenerateStatsOptions`]. +/// +/// Extension schemes can use `get_or_insert_with` for custom stats types. +pub struct ArrayAndStats { + /// The array. + array: ArrayRef, + /// The stats cache. + cache: StatsCache, + /// The stats generation options. + opts: GenerateStatsOptions, +} + +impl ArrayAndStats { + /// Creates a new bundle with the given stats generation options. + /// + /// Stats are generated lazily on first access via the typed accessor methods. + pub fn new(array: ArrayRef, opts: GenerateStatsOptions) -> Self { + Self { + array, + cache: StatsCache::new(), + opts, + } + } + + /// Returns a reference to the array. + pub fn array(&self) -> &ArrayRef { + &self.array + } + + /// Consumes the bundle and returns the array. + pub fn into_array(self) -> ArrayRef { + self.array + } + + /// Returns integer stats, generating them lazily on first access. + pub fn integer_stats(&mut self) -> &IntegerStats { + let array = self.array.clone(); + let opts = self.opts; + + self.cache.get_or_insert_with::(|| { + IntegerStats::generate_opts(&array.to_primitive(), opts) + }) + } + + /// Returns float stats, generating them lazily on first access. + pub fn float_stats(&mut self) -> &FloatStats { + let array = self.array.clone(); + let opts = self.opts; + + self.cache.get_or_insert_with::(|| { + FloatStats::generate_opts(&array.to_primitive(), opts) + }) + } + + /// Returns string stats, generating them lazily on first access. 
+ pub fn string_stats(&mut self) -> &StringStats { + let array = self.array.clone(); + let opts = self.opts; + + self.cache.get_or_insert_with::(|| { + StringStats::generate_opts(&array.to_varbinview(), opts) + }) + } + + /// For extension schemes with custom stats types. + pub fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { + self.cache.get_or_insert_with::(f) + } +} diff --git a/vortex-compressor/src/stats/float.rs b/vortex-compressor/src/stats/float.rs new file mode 100644 index 00000000000..67877d7796c --- /dev/null +++ b/vortex-compressor/src/stats/float.rs @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Float compression statistics. + +use std::hash::Hash; + +use itertools::Itertools; +use num_traits::Float; +use rustc_hash::FxBuildHasher; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::primitive::NativeValue; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::PType; +use vortex_array::dtype::half::f16; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_error::vortex_panic; +use vortex_mask::AllOr; +use vortex_utils::aliases::hash_set::HashSet; + +use super::GenerateStatsOptions; + +/// Information about the distinct values in a float array. +#[derive(Debug, Clone)] +pub struct DistinctInfo { + /// The set of distinct float values. + distinct_values: HashSet, FxBuildHasher>, + /// The count of unique values. + distinct_count: u32, +} + +impl DistinctInfo { + /// Returns a reference to the distinct values set. + pub fn distinct_values(&self) -> &HashSet, FxBuildHasher> { + &self.distinct_values + } +} + +/// Typed statistics for a specific float type. +#[derive(Debug, Clone)] +pub struct TypedStats { + /// Distinct value information, or `None` if not computed. + distinct: Option>, +} + +impl TypedStats { + /// Returns the distinct value information, if computed. 
+ pub fn distinct(&self) -> Option<&DistinctInfo> { + self.distinct.as_ref() + } +} + +/// Type-erased container for one of the [`TypedStats`] variants. +#[derive(Debug, Clone)] +pub enum ErasedStats { + /// Stats for `f16` arrays. + F16(TypedStats), + /// Stats for `f32` arrays. + F32(TypedStats), + /// Stats for `f64` arrays. + F64(TypedStats), +} + +impl ErasedStats { + /// Get the count of distinct values, if we have computed it already. + fn distinct_count(&self) -> Option { + match self { + ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count), + ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count), + ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count), + } + } +} + +/// Implements `From>` for [`ErasedStats`]. +macro_rules! impl_from_typed { + ($T:ty, $variant:path) => { + impl From> for ErasedStats { + fn from(typed: TypedStats<$T>) -> Self { + $variant(typed) + } + } + }; +} + +impl_from_typed!(f16, ErasedStats::F16); +impl_from_typed!(f32, ErasedStats::F32); +impl_from_typed!(f64, ErasedStats::F64); + +/// Array of floating-point numbers and relevant stats for compression. +#[derive(Debug, Clone)] +pub struct FloatStats { + /// The underlying source array. + src: PrimitiveArray, + /// Cache for `validity.false_count()`. + null_count: u32, + /// Cache for `validity.true_count()`. + value_count: u32, + /// The average run length. + average_run_length: u32, + /// Type-erased typed statistics. + erased: ErasedStats, +} + +impl FloatStats { + /// Generates stats, returning an error on failure. 
+ fn generate_opts_fallible( + input: &PrimitiveArray, + opts: GenerateStatsOptions, + ) -> VortexResult { + match input.ptype() { + PType::F16 => typed_float_stats::(input, opts.count_distinct_values), + PType::F32 => typed_float_stats::(input, opts.count_distinct_values), + PType::F64 => typed_float_stats::(input, opts.count_distinct_values), + _ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()), + } + } + + /// Get the count of distinct values, if we have computed it already. + pub fn distinct_count(&self) -> Option { + self.erased.distinct_count() + } +} + +impl FloatStats { + /// Generates stats with default options. + pub fn generate(input: &PrimitiveArray) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } + + /// Generates stats with provided options. + pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { + Self::generate_opts_fallible(input, opts) + .vortex_expect("FloatStats::generate_opts should not fail") + } + + /// Returns the underlying source array. + pub fn source(&self) -> &PrimitiveArray { + &self.src + } + + /// Returns the number of null values. + pub fn null_count(&self) -> u32 { + self.null_count + } + + /// Returns the number of non-null values. + pub fn value_count(&self) -> u32 { + self.value_count + } + + /// Returns the average run length. + pub fn average_run_length(&self) -> u32 { + self.average_run_length + } + + /// Returns the type-erased typed statistics. + pub fn erased(&self) -> &ErasedStats { + &self.erased + } +} + +/// Computes typed float statistics for a specific float type. +fn typed_float_stats( + array: &PrimitiveArray, + count_distinct_values: bool, +) -> VortexResult +where + NativeValue: Hash + Eq, + TypedStats: Into, +{ + // Special case: empty array. 
+    if array.is_empty() {
+        return Ok(FloatStats {
+            src: array.clone(),
+            null_count: 0,
+            value_count: 0,
+            average_run_length: 0,
+            erased: TypedStats { distinct: None }.into(),
+        });
+    } else if array.all_invalid()? {
+        return Ok(FloatStats {
+            src: array.clone(),
+            null_count: u32::try_from(array.len())?,
+            value_count: 0,
+            average_run_length: 0,
+            erased: TypedStats { distinct: None }.into(),
+        });
+    }
+
+    let null_count = array
+        .statistics()
+        .compute_null_count()
+        .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
+    let value_count = array.len() - null_count;
+
+    // Keep a HashMap of T, then convert the keys into PValue afterward since value is
+    // so much more efficient to hash and search for.
+    let mut distinct_values = if count_distinct_values {
+        HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
+    } else {
+        HashSet::with_hasher(FxBuildHasher)
+    };
+
+    let validity = array.validity_mask()?;
+
+    let mut runs = 1;
+    let head_idx = validity
+        .first()
+        .vortex_expect("All null masks have been handled before");
+    let buff = array.to_buffer::<T>();
+    let mut prev = buff[head_idx];
+
+    let first_valid_buff = buff.slice(head_idx..array.len());
+    match validity.bit_buffer() {
+        AllOr::All => {
+            for value in first_valid_buff {
+                if count_distinct_values {
+                    distinct_values.insert(NativeValue(value));
+                }
+
+                if value != prev {
+                    prev = value;
+                    runs += 1;
+                }
+            }
+        }
+        AllOr::None => unreachable!("All invalid arrays have been handled earlier"),
+        AllOr::Some(v) => {
+            for (&value, valid) in first_valid_buff
+                .iter()
+                .zip_eq(v.slice(head_idx..array.len()).iter())
+            {
+                if valid {
+                    if count_distinct_values {
+                        distinct_values.insert(NativeValue(value));
+                    }
+
+                    if value != prev {
+                        prev = value;
+                        runs += 1;
+                    }
+                }
+            }
+        }
+    }
+
+    let null_count = u32::try_from(null_count)?;
+    let value_count = u32::try_from(value_count)?;
+
+    let distinct = count_distinct_values.then(|| DistinctInfo {
+        distinct_count:
u32::try_from(distinct_values.len()) + .vortex_expect("more than u32::MAX distinct values"), + distinct_values, + }); + + Ok(FloatStats { + null_count, + value_count, + src: array.clone(), + average_run_length: value_count / runs, + erased: TypedStats { distinct }.into(), + }) +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::ToCanonical; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + + use super::FloatStats; + + #[test] + fn test_float_stats() { + let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array(); + let floats = floats.to_primitive(); + + let stats = FloatStats::generate_opts( + &floats, + crate::stats::GenerateStatsOptions { + count_distinct_values: true, + }, + ); + + assert_eq!(stats.value_count, 3); + assert_eq!(stats.null_count, 0); + assert_eq!(stats.average_run_length, 1); + assert_eq!(stats.distinct_count().unwrap(), 3); + } + + #[test] + fn test_float_stats_leading_nulls() { + let floats = PrimitiveArray::new( + buffer![0.0f32, 1.0f32, 2.0f32], + Validity::from_iter([false, true, true]), + ); + + let stats = FloatStats::generate_opts( + &floats, + crate::stats::GenerateStatsOptions { + count_distinct_values: true, + }, + ); + + assert_eq!(stats.value_count, 2); + assert_eq!(stats.null_count, 1); + assert_eq!(stats.average_run_length, 1); + assert_eq!(stats.distinct_count().unwrap(), 2); + } +} diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs new file mode 100644 index 00000000000..1f13118584b --- /dev/null +++ b/vortex-compressor/src/stats/integer.rs @@ -0,0 +1,622 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Integer compression statistics. 
+
+use std::hash::Hash;
+
+use num_traits::PrimInt;
+use rustc_hash::FxBuildHasher;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::primitive::NativeValue;
+use vortex_array::dtype::IntegerPType;
+use vortex_array::expr::stats::Stat;
+use vortex_array::match_each_integer_ptype;
+use vortex_array::scalar::PValue;
+use vortex_array::scalar::Scalar;
+use vortex_buffer::BitBuffer;
+use vortex_error::VortexError;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_mask::AllOr;
+use vortex_utils::aliases::hash_map::HashMap;
+
+use super::GenerateStatsOptions;
+
+/// Information about the distinct values in an integer array.
+#[derive(Debug, Clone)]
+pub struct DistinctInfo<T> {
+    /// The unique values and their occurrences.
+    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
+    /// The count of unique values.
+    distinct_count: u32,
+    /// The most frequent value.
+    most_frequent_value: T,
+    /// The number of times the most frequent value occurs.
+    top_frequency: u32,
+}
+
+impl<T> DistinctInfo<T> {
+    /// Returns a reference to the distinct values map.
+    pub fn distinct_values(&self) -> &HashMap<NativeValue<T>, u32, FxBuildHasher> {
+        &self.distinct_values
+    }
+}
+
+/// Typed statistics for a specific integer type.
+#[derive(Debug, Clone)]
+pub struct TypedStats<T> {
+    /// The minimum value.
+    min: T,
+    /// The maximum value.
+    max: T,
+    /// Distinct value information, or `None` if not computed.
+    distinct: Option<DistinctInfo<T>>,
+}
+
+impl<T> TypedStats<T> {
+    /// Returns the distinct value information, if computed.
+    pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
+        self.distinct.as_ref()
+    }
+}
+
+impl<T> TypedStats<T> {
+    /// Get the count of distinct values, if we have computed it already.
+    fn distinct_count(&self) -> Option<u32> {
+        Some(self.distinct.as_ref()?.distinct_count)
+    }
+
+    /// Get the most commonly occurring value and its count, if we have computed it already.
+    fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> {
+        let distinct = self.distinct.as_ref()?;
+        Some((&distinct.most_frequent_value, distinct.top_frequency))
+    }
+}
+
+/// Type-erased container for one of the [`TypedStats`] variants.
+///
+/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
+/// set of stats. We then perform a variety of access methods on them.
+#[derive(Clone, Debug)]
+pub enum ErasedStats {
+    /// Stats for `u8` arrays.
+    U8(TypedStats<u8>),
+    /// Stats for `u16` arrays.
+    U16(TypedStats<u16>),
+    /// Stats for `u32` arrays.
+    U32(TypedStats<u32>),
+    /// Stats for `u64` arrays.
+    U64(TypedStats<u64>),
+    /// Stats for `i8` arrays.
+    I8(TypedStats<i8>),
+    /// Stats for `i16` arrays.
+    I16(TypedStats<i16>),
+    /// Stats for `i32` arrays.
+    I32(TypedStats<i32>),
+    /// Stats for `i64` arrays.
+    I64(TypedStats<i64>),
+}
+
+impl ErasedStats {
+    /// Returns `true` if the minimum value is zero.
+    pub fn min_is_zero(&self) -> bool {
+        match &self {
+            ErasedStats::U8(x) => x.min == 0,
+            ErasedStats::U16(x) => x.min == 0,
+            ErasedStats::U32(x) => x.min == 0,
+            ErasedStats::U64(x) => x.min == 0,
+            ErasedStats::I8(x) => x.min == 0,
+            ErasedStats::I16(x) => x.min == 0,
+            ErasedStats::I32(x) => x.min == 0,
+            ErasedStats::I64(x) => x.min == 0,
+        }
+    }
+
+    /// Returns `true` if the minimum value is negative.
+    pub fn min_is_negative(&self) -> bool {
+        match &self {
+            ErasedStats::U8(_)
+            | ErasedStats::U16(_)
+            | ErasedStats::U32(_)
+            | ErasedStats::U64(_) => false,
+            ErasedStats::I8(x) => x.min < 0,
+            ErasedStats::I16(x) => x.min < 0,
+            ErasedStats::I32(x) => x.min < 0,
+            ErasedStats::I64(x) => x.min < 0,
+        }
+    }
+
+    /// Difference between max and min.
+ pub fn max_minus_min(&self) -> u64 { + match &self { + ErasedStats::U8(x) => (x.max - x.min) as u64, + ErasedStats::U16(x) => (x.max - x.min) as u64, + ErasedStats::U32(x) => (x.max - x.min) as u64, + ErasedStats::U64(x) => x.max - x.min, + ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64, + ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64, + ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64, + ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128) + .vortex_expect("max minus min result bigger than u64"), + } + } + + /// Returns the ilog2 of the max value when transmuted to unsigned, or `None` if zero. + /// + /// This matches how BitPacking computes bit width: it reinterprets signed values as + /// unsigned (preserving bit pattern) and uses `leading_zeros`. For non-negative signed + /// values, the transmuted value equals the original value. + /// + /// This is used to determine if FOR encoding would reduce bit width compared to + /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help. + pub fn max_ilog2(&self) -> Option { + match &self { + ErasedStats::U8(x) => x.max.checked_ilog2(), + ErasedStats::U16(x) => x.max.checked_ilog2(), + ErasedStats::U32(x) => x.max.checked_ilog2(), + ErasedStats::U64(x) => x.max.checked_ilog2(), + // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior. + ErasedStats::I8(x) => (x.max as u8).checked_ilog2(), + ErasedStats::I16(x) => (x.max as u16).checked_ilog2(), + ErasedStats::I32(x) => (x.max as u32).checked_ilog2(), + ErasedStats::I64(x) => (x.max as u64).checked_ilog2(), + } + } + + /// Get the count of distinct values, if we have computed it already. 
+    pub fn distinct_count(&self) -> Option<u32> {
+        match &self {
+            ErasedStats::U8(x) => x.distinct_count(),
+            ErasedStats::U16(x) => x.distinct_count(),
+            ErasedStats::U32(x) => x.distinct_count(),
+            ErasedStats::U64(x) => x.distinct_count(),
+            ErasedStats::I8(x) => x.distinct_count(),
+            ErasedStats::I16(x) => x.distinct_count(),
+            ErasedStats::I32(x) => x.distinct_count(),
+            ErasedStats::I64(x) => x.distinct_count(),
+        }
+    }
+
+    /// Get the most commonly occurring value and its count.
+    pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
+        match &self {
+            ErasedStats::U8(x) => {
+                let (top_value, count) = x.most_frequent_value_and_count()?;
+                Some(((*top_value).into(), count))
+            }
+            ErasedStats::U16(x) => {
+                let (top_value, count) = x.most_frequent_value_and_count()?;
+                Some(((*top_value).into(), count))
+            }
+            ErasedStats::U32(x) => {
+                let (top_value, count) = x.most_frequent_value_and_count()?;
+                Some(((*top_value).into(), count))
+            }
+            ErasedStats::U64(x) => {
+                let (top_value, count) = x.most_frequent_value_and_count()?;
+                Some(((*top_value).into(), count))
+            }
+            ErasedStats::I8(x) => {
+                let (top_value, count) = x.most_frequent_value_and_count()?;
+                Some(((*top_value).into(), count))
+            }
+            ErasedStats::I16(x) => {
+                let (top_value, count) = x.most_frequent_value_and_count()?;
+                Some(((*top_value).into(), count))
+            }
+            ErasedStats::I32(x) => {
+                let (top_value, count) = x.most_frequent_value_and_count()?;
+                Some(((*top_value).into(), count))
+            }
+            ErasedStats::I64(x) => {
+                let (top_value, count) = x.most_frequent_value_and_count()?;
+                Some(((*top_value).into(), count))
+            }
+        }
+    }
+}
+
+/// Implements `From<TypedStats<T>>` for [`ErasedStats`].
+macro_rules! impl_from_typed {
+    ($T:ty, $variant:path) => {
+        impl From<TypedStats<$T>> for ErasedStats {
+            fn from(typed: TypedStats<$T>) -> Self {
+                $variant(typed)
+            }
+        }
+    };
+}
+
+impl_from_typed!(u8, ErasedStats::U8);
+impl_from_typed!(u16, ErasedStats::U16);
+impl_from_typed!(u32, ErasedStats::U32);
+impl_from_typed!(u64, ErasedStats::U64);
+impl_from_typed!(i8, ErasedStats::I8);
+impl_from_typed!(i16, ErasedStats::I16);
+impl_from_typed!(i32, ErasedStats::I32);
+impl_from_typed!(i64, ErasedStats::I64);
+
+/// Array of integers and relevant stats for compression.
+#[derive(Clone, Debug)]
+pub struct IntegerStats {
+    /// The underlying source array.
+    src: PrimitiveArray,
+    /// Cache for `validity.false_count()`.
+    null_count: u32,
+    /// Cache for `validity.true_count()`.
+    value_count: u32,
+    /// The average run length.
+    average_run_length: u32,
+    /// Type-erased typed statistics.
+    erased: ErasedStats,
+}
+
+impl IntegerStats {
+    /// Generates stats, returning an error on failure.
+    fn generate_opts_fallible(
+        input: &PrimitiveArray,
+        opts: GenerateStatsOptions,
+    ) -> VortexResult<Self> {
+        match_each_integer_ptype!(input.ptype(), |T| {
+            typed_int_stats::<T>(input, opts.count_distinct_values)
+        })
+    }
+
+    /// Get the count of distinct values, if we have computed it already.
+    pub fn distinct_count(&self) -> Option<u32> {
+        self.erased.distinct_count()
+    }
+
+    /// Get the most commonly occurring value and its count, if we have computed it already.
+    pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
+        self.erased.most_frequent_value_and_count()
+    }
+}
+
+impl IntegerStats {
+    /// Generates stats with default options.
+    pub fn generate(input: &PrimitiveArray) -> Self {
+        Self::generate_opts(input, GenerateStatsOptions::default())
+    }
+
+    /// Generates stats with provided options.
+    pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
+        Self::generate_opts_fallible(input, opts)
+            .vortex_expect("IntegerStats::generate_opts should not fail")
+    }
+
+    /// Returns the underlying source array.
+    pub fn source(&self) -> &PrimitiveArray {
+        &self.src
+    }
+
+    /// Returns the number of null values.
+    pub fn null_count(&self) -> u32 {
+        self.null_count
+    }
+
+    /// Returns the number of non-null values.
+    pub fn value_count(&self) -> u32 {
+        self.value_count
+    }
+
+    /// Returns the average run length.
+    pub fn average_run_length(&self) -> u32 {
+        self.average_run_length
+    }
+
+    /// Returns the type-erased typed statistics.
+    pub fn erased(&self) -> &ErasedStats {
+        &self.erased
+    }
+}
+
+/// Computes typed integer statistics for a specific integer type.
+fn typed_int_stats<T>(
+    array: &PrimitiveArray,
+    count_distinct_values: bool,
+) -> VortexResult<IntegerStats>
+where
+    T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
+    TypedStats<T>: Into<ErasedStats>,
+    NativeValue<T>: Eq + Hash,
+{
+    // Special case: empty array.
+    if array.is_empty() {
+        return Ok(IntegerStats {
+            src: array.clone(),
+            null_count: 0,
+            value_count: 0,
+            average_run_length: 0,
+            erased: TypedStats {
+                min: T::max_value(),
+                max: T::min_value(),
+                distinct: None,
+            }
+            .into(),
+        });
+    } else if array.all_invalid()? {
+        return Ok(IntegerStats {
+            src: array.clone(),
+            null_count: u32::try_from(array.len())?,
+            value_count: 0,
+            average_run_length: 0,
+            erased: TypedStats {
+                min: T::max_value(),
+                max: T::min_value(),
+                distinct: None,
+            }
+            .into(),
+        });
+    }
+
+    let validity = array.validity_mask()?;
+    let null_count = validity.false_count();
+    let value_count = validity.true_count();
+
+    // Initialize loop state.
+    let head_idx = validity
+        .first()
+        .vortex_expect("All null masks have been handled before");
+    let buffer = array.to_buffer::<T>();
+    let head = buffer[head_idx];
+
+    let mut loop_state = LoopState {
+        distinct_values: if count_distinct_values {
+            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
+        } else {
+            HashMap::with_hasher(FxBuildHasher)
+        },
+        prev: head,
+        runs: 1,
+    };
+
+    let sliced = buffer.slice(head_idx..array.len());
+    let mut chunks = sliced.as_slice().chunks_exact(64);
+    match validity.bit_buffer() {
+        AllOr::All => {
+            for chunk in &mut chunks {
+                inner_loop_nonnull(
+                    chunk.try_into().ok().vortex_expect("chunk size must be 64"),
+                    count_distinct_values,
+                    &mut loop_state,
+                )
+            }
+            let remainder = chunks.remainder();
+            inner_loop_naive(
+                remainder,
+                count_distinct_values,
+                &BitBuffer::new_set(remainder.len()),
+                &mut loop_state,
+            );
+        }
+        AllOr::None => unreachable!("All invalid arrays have been handled before"),
+        AllOr::Some(v) => {
+            let mask = v.slice(head_idx..array.len());
+            let mut offset = 0;
+            for chunk in &mut chunks {
+                let validity = mask.slice(offset..(offset + 64));
+                offset += 64;
+
+                match validity.true_count() {
+                    // All nulls -> no stats to update.
+                    0 => continue,
+                    // Inner loop for when validity check can be elided.
+                    64 => inner_loop_nonnull(
+                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
+                        count_distinct_values,
+                        &mut loop_state,
+                    ),
+                    // Inner loop for when we need to check validity.
+                    _ => inner_loop_nullable(
+                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
+                        count_distinct_values,
+                        &validity,
+                        &mut loop_state,
+                    ),
+                }
+            }
+            // Final iteration, run naive loop.
+            let remainder = chunks.remainder();
+            inner_loop_naive(
+                remainder,
+                count_distinct_values,
+                &mask.slice(offset..(offset + remainder.len())),
+                &mut loop_state,
+            );
+        }
+    }
+
+    let runs = loop_state.runs;
+
+    let min = array
+        .statistics()
+        .compute_as::<T>(Stat::Min)
+        .vortex_expect("min should be computed");
+
+    let max = array
+        .statistics()
+        .compute_as::<T>(Stat::Max)
+        .vortex_expect("max should be computed");
+
+    let distinct = count_distinct_values.then(|| {
+        let (&top_value, &top_count) = loop_state
+            .distinct_values
+            .iter()
+            .max_by_key(|&(_, &count)| count)
+            .vortex_expect("we know this is non-empty");
+
+        DistinctInfo {
+            distinct_count: u32::try_from(loop_state.distinct_values.len())
+                .vortex_expect("there are more than `u32::MAX` distinct values"),
+            most_frequent_value: top_value.0,
+            top_frequency: top_count,
+            distinct_values: loop_state.distinct_values,
+        }
+    });
+
+    let typed = TypedStats { min, max, distinct };
+
+    let null_count = u32::try_from(null_count)?;
+    let value_count = u32::try_from(value_count)?;
+
+    Ok(IntegerStats {
+        src: array.clone(),
+        null_count,
+        value_count,
+        average_run_length: value_count / runs,
+        erased: typed.into(),
+    })
+}
+
+/// Internal loop state for integer stats computation.
+struct LoopState<T> {
+    /// The previous value seen.
+    prev: T,
+    /// The run count.
+    runs: u32,
+    /// The distinct values map.
+    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
+}
+
+/// Inner loop for non-null chunks of 64 values.
+#[inline(always)]
+fn inner_loop_nonnull<T: Copy + PartialEq>(
+    values: &[T; 64],
+    count_distinct_values: bool,
+    state: &mut LoopState<T>,
+) where
+    NativeValue<T>: Eq + Hash,
+{
+    for &value in values {
+        if count_distinct_values {
+            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
+        }
+
+        if value != state.prev {
+            state.prev = value;
+            state.runs += 1;
+        }
+    }
+}
+
+/// Inner loop for nullable chunks of 64 values.
+#[inline(always)]
+fn inner_loop_nullable<T: Copy + PartialEq>(
+    values: &[T; 64],
+    count_distinct_values: bool,
+    is_valid: &BitBuffer,
+    state: &mut LoopState<T>,
+) where
+    NativeValue<T>: Eq + Hash,
+{
+    for (idx, &value) in values.iter().enumerate() {
+        if is_valid.value(idx) {
+            if count_distinct_values {
+                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
+            }
+
+            if value != state.prev {
+                state.prev = value;
+                state.runs += 1;
+            }
+        }
+    }
+}
+
+/// Fallback inner loop for remainder values.
+#[inline(always)]
+fn inner_loop_naive<T: Copy + PartialEq>(
+    values: &[T],
+    count_distinct_values: bool,
+    is_valid: &BitBuffer,
+    state: &mut LoopState<T>,
+) where
+    NativeValue<T>: Eq + Hash,
+{
+    for (idx, &value) in values.iter().enumerate() {
+        if is_valid.value(idx) {
+            if count_distinct_values {
+                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
+            }
+
+            if value != state.prev {
+                state.prev = value;
+                state.runs += 1;
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::iter;
+
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::BitBuffer;
+    use vortex_buffer::Buffer;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+
+    use super::IntegerStats;
+    use super::typed_int_stats;
+
+    #[test]
+    fn test_naive_count_distinct_values() -> VortexResult<()> {
+        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
+        let stats = typed_int_stats::<u8>(&array, true)?;
+        assert_eq!(stats.distinct_count().unwrap(), 2);
+        Ok(())
+    }
+
+    #[test]
+    fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
+        let array = PrimitiveArray::new(
+            buffer![217u8, 0],
+            Validity::from(BitBuffer::from(vec![true, false])),
+        );
+        let stats = typed_int_stats::<u8>(&array, true)?;
+        assert_eq!(stats.distinct_count().unwrap(), 1);
+        Ok(())
+    }
+
+    #[test]
+    fn test_count_distinct_values() -> VortexResult<()> {
+        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
+        let stats = typed_int_stats::<u8>(&array, true)?;
+        assert_eq!(stats.distinct_count().unwrap(), 128);
+        Ok(())
+    }
+
+    #[test]
+    fn test_count_distinct_values_nullable() -> VortexResult<()> {
+        let array = PrimitiveArray::new(
+            (0..128u8).collect::<Buffer<u8>>(),
+            Validity::from(BitBuffer::from_iter(
+                iter::repeat_n(vec![true, false], 64).flatten(),
+            )),
+        );
+        let stats = typed_int_stats::<u8>(&array, true)?;
+        assert_eq!(stats.distinct_count().unwrap(), 64);
+        Ok(())
+    }
+
+    #[test]
+    fn test_integer_stats_leading_nulls() {
+        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
+
+        let stats = IntegerStats::generate_opts(
+            &ints,
+            crate::stats::GenerateStatsOptions {
+                count_distinct_values: true,
+            },
+        );
+
+        assert_eq!(stats.value_count, 2);
+        assert_eq!(stats.null_count, 1);
+        assert_eq!(stats.average_run_length, 1);
+        assert_eq!(stats.distinct_count().unwrap(), 2);
+    }
+}
diff --git a/vortex-compressor/src/stats/mod.rs b/vortex-compressor/src/stats/mod.rs
new file mode 100644
index 00000000000..e4417b66b3d
--- /dev/null
+++ b/vortex-compressor/src/stats/mod.rs
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Compression statistics types and caching.
+ +mod cache; +mod float; +mod integer; +mod options; +mod string; + +pub use cache::ArrayAndStats; +pub use float::DistinctInfo as FloatDistinctInfo; +pub use float::ErasedStats as FloatErasedStats; +pub use float::FloatStats; +pub use float::TypedStats as FloatTypedStats; +pub use integer::DistinctInfo as IntegerDistinctInfo; +pub use integer::ErasedStats as IntegerErasedStats; +pub use integer::IntegerStats; +pub use integer::TypedStats as IntegerTypedStats; +pub use options::GenerateStatsOptions; +pub use string::StringStats; diff --git a/vortex-compressor/src/stats/options.rs b/vortex-compressor/src/stats/options.rs new file mode 100644 index 00000000000..d53b69d748a --- /dev/null +++ b/vortex-compressor/src/stats/options.rs @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression statistics types. + +/// Configures how stats are generated. +/// +/// Each scheme declares its required options via [`Scheme::stats_options`]. The compressor +/// merges all eligible schemes' options before generating stats, so that a single stats pass +/// satisfies every scheme. +/// +/// [`Scheme::stats_options`]: crate::scheme::Scheme::stats_options +#[derive(Debug, Default, Clone, Copy)] +pub struct GenerateStatsOptions { + /// Whether distinct values should be counted during stats generation. + pub count_distinct_values: bool, +} + +impl GenerateStatsOptions { + /// Merges two options by OR-ing each field. The result enables a stat if either input does. 
+    pub fn merge(self, other: Self) -> Self {
+        Self {
+            count_distinct_values: self.count_distinct_values || other.count_distinct_values,
+        }
+    }
+}
diff --git a/vortex-compressor/src/stats/string.rs b/vortex-compressor/src/stats/string.rs
new file mode 100644
index 00000000000..f8db9d0c4f2
--- /dev/null
+++ b/vortex-compressor/src/stats/string.rs
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! String compression statistics.
+
+use vortex_array::arrays::VarBinViewArray;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+use vortex_error::vortex_err;
+use vortex_utils::aliases::hash_set::HashSet;
+
+use super::GenerateStatsOptions;
+
+/// Array of variable-length byte arrays, and relevant stats for compression.
+#[derive(Clone, Debug)]
+pub struct StringStats {
+    /// The underlying source array.
+    src: VarBinViewArray,
+    /// The estimated number of distinct strings, or `None` if not computed.
+    estimated_distinct_count: Option<u32>,
+    /// The number of non-null values.
+    value_count: u32,
+    /// The number of null values.
+    null_count: u32,
+}
+
+/// Estimate the number of distinct strings in the var bin view array.
+fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult<u32> {
+    let views = strings.views();
+    // Iterate the views. Two strings which are equal must have the same first 8-bytes.
+    // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
+    // share a 4-byte prefix and have the same length.
+    let mut distinct = HashSet::with_capacity(views.len() / 2);
+    views.iter().for_each(|&view| {
+        #[expect(
+            clippy::cast_possible_truncation,
+            reason = "approximate uniqueness with view prefix"
+        )]
+        let len_and_prefix = view.as_u128() as u64;
+        distinct.insert(len_and_prefix);
+    });
+
+    Ok(u32::try_from(distinct.len())?)
+}
+
+impl StringStats {
+    /// Generates stats, returning an error on failure.
+    fn generate_opts_fallible(
+        input: &VarBinViewArray,
+        opts: GenerateStatsOptions,
+    ) -> VortexResult<Self> {
+        let null_count = input
+            .statistics()
+            .compute_null_count()
+            .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
+        let value_count = input.len() - null_count;
+        let estimated_distinct_count = opts
+            .count_distinct_values
+            .then(|| estimate_distinct_count(input))
+            .transpose()?;
+
+        Ok(Self {
+            src: input.clone(),
+            value_count: u32::try_from(value_count)?,
+            null_count: u32::try_from(null_count)?,
+            estimated_distinct_count,
+        })
+    }
+}
+
+impl StringStats {
+    /// Generates stats with default options.
+    pub fn generate(input: &VarBinViewArray) -> Self {
+        Self::generate_opts(input, GenerateStatsOptions::default())
+    }
+
+    /// Generates stats with provided options.
+    pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
+        Self::generate_opts_fallible(input, opts)
+            .vortex_expect("StringStats::generate_opts should not fail")
+    }
+
+    /// Returns the underlying source array.
+    pub fn source(&self) -> &VarBinViewArray {
+        &self.src
+    }
+
+    /// Returns the estimated number of distinct strings, or `None` if not computed.
+    pub fn estimated_distinct_count(&self) -> Option<u32> {
+        self.estimated_distinct_count
+    }
+
+    /// Returns the number of non-null values.
+    pub fn value_count(&self) -> u32 {
+        self.value_count
+    }
+
+    /// Returns the number of null values.
+ pub fn null_count(&self) -> u32 { + self.null_count + } +} diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 4d6031a220c..efd693c5ca1 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -28,14 +28,6 @@ use vortex_array::arrays::VarBinView; use vortex_array::dtype::FieldPath; use vortex_array::session::ArrayRegistry; use vortex_array::session::ArraySession; -#[cfg(feature = "zstd")] -use vortex_btrblocks::BtrBlocksCompressorBuilder; -#[cfg(feature = "zstd")] -use vortex_btrblocks::FloatCode; -#[cfg(feature = "zstd")] -use vortex_btrblocks::IntCode; -#[cfg(feature = "zstd")] -use vortex_btrblocks::StringCode; use vortex_bytebool::ByteBool; use vortex_datetime_parts::DateTimeParts; use vortex_decimal_byte_parts::DecimalByteParts; @@ -63,6 +55,16 @@ use vortex_sequence::Sequence; use vortex_sparse::Sparse; use vortex_utils::aliases::hash_map::HashMap; use vortex_zigzag::ZigZag; + +#[rustfmt::skip] +#[cfg(feature = "zstd")] +use vortex_btrblocks::{ + BtrBlocksCompressorBuilder, + SchemeExt, + schemes::float, + schemes::integer, + schemes::string, +}; #[cfg(feature = "zstd")] use vortex_zstd::Zstd; #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] @@ -196,18 +198,22 @@ impl WriteStrategyBuilder { /// GPU decompression. Without it, strings use interleaved Zstd compression. 
#[cfg(feature = "zstd")] pub fn with_cuda_compatible_encodings(mut self) -> Self { - let mut builder = BtrBlocksCompressorBuilder::default() - .exclude_int([IntCode::Sparse, IntCode::Rle]) - .exclude_float([FloatCode::Rle, FloatCode::Sparse]) - .exclude_string([StringCode::Dict, StringCode::Fsst]); + let mut builder = BtrBlocksCompressorBuilder::default().exclude([ + integer::SparseScheme.id(), + integer::RLE_INTEGER_SCHEME.id(), + float::RLE_FLOAT_SCHEME.id(), + float::NullDominatedSparseScheme.id(), + string::StringDictScheme.id(), + string::FSSTScheme.id(), + ]); #[cfg(feature = "unstable_encodings")] { - builder = builder.include_string([StringCode::ZstdBuffers]); + builder = builder.include([string::ZstdBuffersScheme.id()]); } #[cfg(not(feature = "unstable_encodings"))] { - builder = builder.include_string([StringCode::Zstd]); + builder = builder.include([string::ZstdScheme.id()]); } self.compressor = Some(Arc::new(builder.build())); @@ -222,9 +228,11 @@ impl WriteStrategyBuilder { #[cfg(feature = "zstd")] pub fn with_compact_encodings(mut self) -> Self { let btrblocks = BtrBlocksCompressorBuilder::default() - .include_string([StringCode::Zstd]) - .include_int([IntCode::Pco]) - .include_float([FloatCode::Pco]) + .include([ + string::ZstdScheme.id(), + integer::PcoScheme.id(), + float::PcoScheme.id(), + ]) .build(); self.compressor = Some(Arc::new(btrblocks)); diff --git a/vortex-layout/src/layouts/compressed.rs b/vortex-layout/src/layouts/compressed.rs index 58ba381d415..603b2360e0d 100644 --- a/vortex-layout/src/layouts/compressed.rs +++ b/vortex-layout/src/layouts/compressed.rs @@ -11,7 +11,8 @@ use vortex_array::DynArray; use vortex_array::expr::stats::Stat; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; -use vortex_btrblocks::IntCode; +use vortex_btrblocks::SchemeExt; +use vortex_btrblocks::schemes::integer::IntDictScheme; use vortex_error::VortexResult; use vortex_io::runtime::Handle; @@ -69,7 +70,7 @@ impl 
CompressingStrategy { pub fn new_btrblocks(child: S, exclude_int_dict_encoding: bool) -> Self { let compressor = if exclude_int_dict_encoding { BtrBlocksCompressorBuilder::default() - .exclude_int([IntCode::Dict]) + .exclude([IntDictScheme.id()]) .build() } else { BtrBlocksCompressor::default() diff --git a/vortex/public-api.lock b/vortex/public-api.lock index 0c8ce9d0cd9..7be026902db 100644 --- a/vortex/public-api.lock +++ b/vortex/public-api.lock @@ -22,11 +22,9 @@ pub use vortex::compressor::BtrBlocksCompressor pub use vortex::compressor::BtrBlocksCompressorBuilder -pub use vortex::compressor::FloatCode +pub use vortex::compressor::Scheme -pub use vortex::compressor::IntCode - -pub use vortex::compressor::StringCode +pub use vortex::compressor::SchemeId pub mod vortex::dtype diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index a532fc1adad..ab22ea36f4e 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -36,9 +36,8 @@ pub mod buffer { pub mod compressor { pub use vortex_btrblocks::BtrBlocksCompressor; pub use vortex_btrblocks::BtrBlocksCompressorBuilder; - pub use vortex_btrblocks::FloatCode; - pub use vortex_btrblocks::IntCode; - pub use vortex_btrblocks::StringCode; + pub use vortex_btrblocks::Scheme; + pub use vortex_btrblocks::SchemeId; } pub mod dtype {