From 3c0516641a0ee38664f664bcf580515972323303 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 29 Dec 2025 15:48:36 +0100 Subject: [PATCH 1/3] enable vae tiling for vid gen --- examples/cli/main.cpp | 1 + stable-diffusion.cpp | 17 ++++++++++++----- stable-diffusion.h | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 77ef1c935..cb8aac7ff 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -668,6 +668,7 @@ int main(int argc, const char* argv[]) { gen_params.seed, gen_params.video_frames, gen_params.vace_strength, + ctx_params.vae_tiling_params, gen_params.cache_params, }; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 58d420415..c1376bfe7 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2414,10 +2414,15 @@ class StableDiffusionGGML { ne2 = 1; ne3 = C * x->ne[3]; } else { - if (!use_tiny_autoencoder) { - C *= 2; + int out_channels = C; + bool encode_outputs_mu = use_tiny_autoencoder || + sd_version_is_wan(version) || + sd_version_is_flux2(version) || + version == VERSION_CHROMA_RADIANCE; + if (!encode_outputs_mu) { + out_channels *= 2; } - ne2 = C; + ne2 = out_channels; ne3 = x->ne[3]; } result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3); @@ -2558,7 +2563,7 @@ class StableDiffusionGGML { } process_latent_out(x); // x = load_tensor_from_file(work_ctx, "wan_vae_z.bin"); - if (vae_tiling_params.enabled && !decode_video) { + if (vae_tiling_params.enabled) { float tile_overlap; int tile_size_x, tile_size_y; get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, x->ne[0], x->ne[1]); @@ -2576,7 +2581,7 @@ class StableDiffusionGGML { first_stage_model->free_compute_buffer(); process_vae_output_tensor(result); } else { - if (vae_tiling_params.enabled && !decode_video) { + if (vae_tiling_params.enabled) { // split latent in 64x64 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { tae_first_stage->compute(n_threads, in, true, &out); @@ -3023,6 +3028,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { sd_vid_gen_params->video_frames = 6; sd_vid_gen_params->moe_boundary = 0.875f; sd_vid_gen_params->vace_strength = 1.f; + sd_vid_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; sd_cache_params_init(&sd_vid_gen_params->cache); } @@ -3744,6 +3750,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) { return nullptr; } + sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params; std::string prompt = SAFE_STR(sd_vid_gen_params->prompt); std::string negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt); diff --git a/stable-diffusion.h b/stable-diffusion.h index 3e9faf854..45190eda6 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -318,6 +318,7 @@ typedef struct { int64_t seed; int video_frames; float vace_strength; + sd_tiling_params_t vae_tiling_params; sd_cache_params_t cache; } sd_vid_gen_params_t; From ebe21ecb8e65e6287e04ea7c4425d806726ccce1 Mon Sep 17 00:00:00 2001 From: leejet Date: Thu, 8 Jan 2026 23:16:39 +0800 Subject: [PATCH 2/3] format code --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index c1376bfe7..da51962cf 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2414,7 +2414,7 @@ class StableDiffusionGGML { ne2 = 1; ne3 = C * x->ne[3]; } else { - int out_channels = C; + int out_channels = C; bool encode_outputs_mu = use_tiny_autoencoder || sd_version_is_wan(version) || sd_version_is_flux2(version) || From ce78b597010e53527faa5ce923c04b27debc2787 Mon Sep 17 00:00:00 2001 From: leejet Date: Thu, 8 Jan 2026 23:19:05 +0800 Subject: [PATCH 3/3] eliminate compilation warning --- stable-diffusion.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ef396e16e..60bcba4d3 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2489,7 +2489,7 @@ class StableDiffusionGGML { ne2 = 1; ne3 = C * x->ne[3]; } else { - int out_channels = C; + int64_t out_channels = C; bool encode_outputs_mu = use_tiny_autoencoder || sd_version_is_wan(version) || sd_version_is_flux2(version) || @@ -3051,7 +3051,8 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { "height: %d\n" "sample_params: %s\n" "strength: %.2f\n" - "seed: %" PRId64 "\n" + "seed: %" PRId64 + "\n" "batch_count: %d\n" "ref_images_count: %d\n" "auto_resize_ref_image: %s\n"