shiguredo · voluntas · Jan 27, 2026 · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026
diff --git a/CHANGES.md b/CHANGES.md
@@ -13,9 +13,10 @@
 
 - [ADD] VideoEncoder にスケーリング機能を追加する
   - WebCodecs API 仕様に準拠: encode で渡されるフレームの解像度と configure で指定した解像度が異なる場合に自動的にスケーリング
-  - Apple Video Toolbox: VTPixelTransferSession を使用 (Metal ベースの HW アクセラレーション)
-  - ソフトウェアエンコーダー (AV1/VP8/VP9): libyuv の I420Scale を使用
-  - NVENC / Intel VPL: libyuv の I420Scale を使用
+  - 対応ピクセルフォーマット: I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR
+  - Apple Video Toolbox: VTPixelTransferSession を使用 (I420, NV12, BGRA のみ直接対応、他は NV12 に変換)
+  - ソフトウェアエンコーダー (AV1/VP8/VP9): libyuv を使用してフォーマット変換とスケーリング
+  - NVENC / Intel VPL: libyuv を使用してフォーマット変換とスケーリング
   - @voluntas
 - [ADD] VP9 で scalabilityMode (L1T2/L1T3) をサポートする
   - VideoEncoderConfig で `scalability_mode` を指定可能

diff --git a/docs/PYTHON_INTERFACE.md b/docs/PYTHON_INTERFACE.md
@@ -949,18 +949,20 @@ encoder.close()
 
 **スケーリング実装の詳細**:
 
-| エンコーダー | スケーリング方式 | 備考 |
-|------------|----------------|------|
-| Apple Video Toolbox (H.264/HEVC) | VTPixelTransferSession | Metal ベースの HW アクセラレーション |
-| ソフトウェアエンコーダー (AV1/VP8/VP9) | libyuv I420Scale | kFilterBox 補間 |
-| NVIDIA Video Codec SDK (NVENC) | libyuv I420Scale | NV12→I420→スケーリング→NV12 |
-| Intel VPL | libyuv I420Scale | NV12→I420→スケーリング→NV12 |
+| エンコーダー | スケーリング方式 | 対応フォーマット |
+|------------|----------------|----------------|
+| Apple Video Toolbox (H.264/HEVC) | VTPixelTransferSession (HWA) | I420, NV12, BGRA (他のフォーマットは NV12 に変換してから入力) |
+| ソフトウェアエンコーダー (AV1/VP8/VP9) | libyuv (各フォーマット対応) | I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR |
+| NVIDIA Video Codec SDK (NVENC) | libyuv (各フォーマット対応) | I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR |
+| Intel VPL | libyuv (各フォーマット対応) | I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR |
 
 **注意事項**:
 
 - スケーリングはダウンスケール、アップスケールの両方に対応
 - アスペクト比は `configure()` で指定した解像度に合わせられる（引き伸ばし）
 - 同じ解像度のフレームはスケーリング処理をスキップ
+- libyuv を使用するエンコーダーでは、入力フォーマットに応じた libyuv スケーラーが使用される (I420Scale, I422Scale, I444Scale, NV12Scale, ARGBScale)
+- RGB/BGR フォーマットは I420/NV12 に変換後スケーリング (libyuv に RGBScale がないため)
 
 ## 独自インターフェース
 

diff --git a/src/bindings/video_encoder.cpp b/src/bindings/video_encoder.cpp
@@ -260,6 +260,7 @@ static ScalabilityModeConfig parse_scalability_mode(const std::string& mode) {
 #include "video_encoder_aom.cpp"
 #include "video_encoder_apple_video_toolbox.cpp"
 #include "video_encoder_nvidia.cpp"
+#include "video_scaler.cpp"
 #if defined(__APPLE__) || defined(__linux__)
 #include "video_encoder_vpx.cpp"
 #endif

diff --git a/src/bindings/video_encoder_aom.cpp b/src/bindings/video_encoder_aom.cpp
@@ -3,9 +3,8 @@
 #include <cstring>
 #include <thread>
 
-#include <libyuv.h>
-
 #include "video_encoder.h"
+#include "video_scaler.h"
 
 // WebRTC の NumberOfThreads ロジックに準拠
 // タイル数（1, 2, 4, 8）に合わせてスレッド数を決定
@@ -363,53 +362,16 @@ void VideoEncoder::encode_frame_aom(const VideoFrame& frame,
     svc_metadata = SvcOutputMetadata(temporal_layer_id);
   }
 
-  // スケーリングが必要かどうかを判定
-  bool needs_scaling =
-      (frame.width() != config_.width || frame.height() != config_.height);
-
-  // スケーリング用のバッファ
-  std::vector<uint8_t> scaled_buffer;
-  const uint8_t* src_y = frame.plane_ptr(0);
-  const uint8_t* src_u = frame.plane_ptr(1);
-  const uint8_t* src_v = frame.plane_ptr(2);
-  int src_stride_y = static_cast<int>(frame.width());
-  int src_stride_u = static_cast<int>(frame.width() / 2);
-  int src_stride_v = static_cast<int>(frame.width() / 2);
-
-  // スケーリングが必要な場合は libyuv で変換
-  if (needs_scaling) {
-    uint32_t dst_width = config_.width;
-    uint32_t dst_height = config_.height;
-    size_t y_size = dst_width * dst_height;
-    size_t uv_size = (dst_width / 2) * (dst_height / 2);
-    scaled_buffer.resize(y_size + uv_size * 2);
-
-    uint8_t* dst_y = scaled_buffer.data();
-    uint8_t* dst_u = dst_y + y_size;
-    uint8_t* dst_v = dst_u + uv_size;
-    int dst_stride_y = static_cast<int>(dst_width);
-    int dst_stride_u = static_cast<int>(dst_width / 2);
-    int dst_stride_v = static_cast<int>(dst_width / 2);
-
-    int result = libyuv::I420Scale(
-        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-        static_cast<int>(frame.width()), static_cast<int>(frame.height()),
-        dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
-        static_cast<int>(dst_width), static_cast<int>(dst_height),
-        libyuv::kFilterBox);
-
-    if (result != 0) {
-      throw std::runtime_error("libyuv::I420Scale failed");
-    }
+  // スケーリングと I420 変換
+  auto scaled =
+      video_scaler::scale_to_i420(frame, config_.width, config_.height);
 
-    // スケーリング後のポインタとストライドを更新
-    src_y = dst_y;
-    src_u = dst_u;
-    src_v = dst_v;
-    src_stride_y = dst_stride_y;
-    src_stride_u = dst_stride_u;
-    src_stride_v = dst_stride_v;
-  }
+  const uint8_t* src_y = scaled.y;
+  const uint8_t* src_u = scaled.u;
+  const uint8_t* src_v = scaled.v;
+  int src_stride_y = scaled.stride_y;
+  int src_stride_u = scaled.stride_u;
+  int src_stride_v = scaled.stride_v;
 
   // Wrap I420 memory from VideoFrame or scaled buffer
   aom_image_t img;

diff --git a/src/bindings/video_encoder_apple_video_toolbox.cpp b/src/bindings/video_encoder_apple_video_toolbox.cpp
@@ -4,12 +4,13 @@
 #include <CoreFoundation/CoreFoundation.h>
 #include <CoreVideo/CoreVideo.h>
 #include <VideoToolbox/VideoToolbox.h>
+#include <libyuv.h>
 #include <nanobind/nanobind.h>
 #include <memory>
 #include <vector>
 
 #include "encoded_video_chunk.h"
-#include "video_frame.h"  // VideoFrame の完全な定義が必要
+#include "video_frame.h"
 
 namespace nb = nanobind;
 
@@ -446,15 +447,41 @@ void VideoEncoder::encode_frame_videotoolbox(
 
   // native_buffer がない場合は CVPixelBuffer を作成してコピー
   if (!pb_from_native) {
-    // Make sure we have NV12 source
-    std::unique_ptr<VideoFrame> nv12;
-    if (frame.format() != VideoPixelFormat::NV12) {
-      nv12 = frame.convert_format(VideoPixelFormat::NV12);
+    // スケーリング時は VTPixelTransferSession でフォーマット変換とスケーリングを同時に行う
+    // VTPixelTransferSession がサポートするフォーマット: I420, NV12, BGRA
+    // スケーリングなしの場合は NV12 に変換が必要
+    bool use_native_format =
+        needs_scaling && (frame.format() == VideoPixelFormat::I420 ||
+                          frame.format() == VideoPixelFormat::NV12 ||
+                          frame.format() == VideoPixelFormat::BGRA);
+
+    // 入力フレームを変換するかどうかを決定
+    std::unique_ptr<VideoFrame> converted;
+    const VideoFrame* src_frame = &frame;
+
+    if (!use_native_format && frame.format() != VideoPixelFormat::NV12) {
+      // VTPixelTransferSession がサポートしないフォーマット、またはスケーリングなしの場合
+      // NV12 に変換
+      converted = frame.convert_format(VideoPixelFormat::NV12);
+      src_frame = converted.get();
+    }
+
+    // CVPixelBuffer のピクセルフォーマットを決定
+    OSType pixel_format;
+    switch (src_frame->format()) {
+      case VideoPixelFormat::I420:
+        pixel_format = kCVPixelFormatType_420YpCbCr8Planar;
+        break;
+      case VideoPixelFormat::BGRA:
+        pixel_format = kCVPixelFormatType_32BGRA;
+        break;
+      case VideoPixelFormat::NV12:
+      default:
+        pixel_format = kCVPixelFormatType_420YpCbCr8BiPlanarFullRange;
+        break;
     }
-    const VideoFrame& src = nv12 ? *nv12 : frame;
 
     // 入力フレームサイズの CVPixelBuffer を作成
-    OSType pixel_format = kCVPixelFormatType_420YpCbCr8BiPlanarFullRange;
     CFDictionaryRef empty_dict = CFDictionaryCreate(
         kCFAllocatorDefault, nullptr, nullptr, 0,
         &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
@@ -464,8 +491,9 @@ void VideoEncoder::encode_frame_videotoolbox(
         kCFAllocatorDefault, pb_keys, pb_vals, 1,
         &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
 
-    CVReturn r = CVPixelBufferCreate(kCFAllocatorDefault, src.width(),
-                                     src.height(), pixel_format, pb_attrs, &pb);
+    CVReturn r =
+        CVPixelBufferCreate(kCFAllocatorDefault, src_frame->width(),
+                            src_frame->height(), pixel_format, pb_attrs, &pb);
 
     CFRelease(pb_attrs);
     CFRelease(empty_dict);
@@ -474,37 +502,79 @@ void VideoEncoder::encode_frame_videotoolbox(
       throw std::runtime_error("Failed to create CVPixelBuffer for input");
     }
 
-    // Copy planes into CVPixelBuffer
+    // フォーマットに応じてデータをコピー
     CVPixelBufferLockBaseAddress(pb, 0);
-    uint8_t* dst_y = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 0);
-    size_t dst_stride_y = CVPixelBufferGetBytesPerRowOfPlane(pb, 0);
-    uint8_t* dst_uv = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 1);
-    size_t dst_stride_uv = CVPixelBufferGetBytesPerRowOfPlane(pb, 1);
-
-    const uint8_t* src_y = src.plane_ptr(0);
-    const uint8_t* src_uv = src.plane_ptr(1);
-    int width = static_cast<int>(src.width());
-    int height = static_cast<int>(src.height());
-    int chroma_height = (height + 1) / 2;
-    // Y plane
-    if (dst_stride_y == static_cast<size_t>(width)) {
-      memcpy(dst_y, src_y, static_cast<size_t>(width * height));
-    } else {
-      for (int i = 0; i < height; ++i) {
-        memcpy(dst_y + i * dst_stride_y, src_y + i * width, width);
+
+    switch (src_frame->format()) {
+      case VideoPixelFormat::I420: {
+        // I420: 3 プレーン (Y, U, V)
+        int width = static_cast<int>(src_frame->width());
+        int height = static_cast<int>(src_frame->height());
+        int chroma_width = (width + 1) / 2;
+        int chroma_height = (height + 1) / 2;
+
+        // Y plane
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(0), width,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 0),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 0)), width,
+            height);
+
+        // U plane
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(1), chroma_width,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 1),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 1)),
+            chroma_width, chroma_height);
+
+        // V plane
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(2), chroma_width,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 2),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 2)),
+            chroma_width, chroma_height);
+        break;
       }
-    }
-    // UV plane (interleaved)
-    int chroma_row_bytes = ((width + 1) / 2) * 2;
-    if (dst_stride_uv == static_cast<size_t>(chroma_row_bytes)) {
-      memcpy(dst_uv, src_uv,
-             static_cast<size_t>(chroma_row_bytes * chroma_height));
-    } else {
-      for (int i = 0; i < chroma_height; ++i) {
-        memcpy(dst_uv + i * dst_stride_uv, src_uv + i * chroma_row_bytes,
-               chroma_row_bytes);
+
+      case VideoPixelFormat::BGRA: {
+        // BGRA: 単一プレーン
+        int width = static_cast<int>(src_frame->width());
+        int height = static_cast<int>(src_frame->height());
+        int row_bytes = width * 4;
+
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(0), row_bytes,
+            (uint8_t*)CVPixelBufferGetBaseAddress(pb),
+            static_cast<int>(CVPixelBufferGetBytesPerRow(pb)), row_bytes,
+            height);
+        break;
+      }
+
+      case VideoPixelFormat::NV12:
+      default: {
+        // NV12: 2 プレーン (Y, UV)
+        int width = static_cast<int>(src_frame->width());
+        int height = static_cast<int>(src_frame->height());
+        int chroma_height = (height + 1) / 2;
+        int chroma_row_bytes = ((width + 1) / 2) * 2;
+
+        // Y plane
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(0), width,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 0),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 0)), width,
+            height);
+
+        // UV plane (interleaved)
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(1), chroma_row_bytes,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 1),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 1)),
+            chroma_row_bytes, chroma_height);
+        break;
       }
     }
+
     CVPixelBufferUnlockBaseAddress(pb, 0);
   }
 
@@ -533,7 +603,8 @@ void VideoEncoder::encode_frame_videotoolbox(
           "Failed to create scaled CVPixelBuffer from pool");
     }
 
-    // VTPixelTransferSessionTransferImage でスケーリング
+    // VTPixelTransferSessionTransferImage でスケーリングとフォーマット変換を実行
+    // 入力は I420/NV12/BGRA のいずれか、出力は NV12
     auto transfer_session =
         (VTPixelTransferSessionRef)vt_pixel_transfer_session_;
     OSStatus transfer_err =