diff --git a/CHANGES.md b/CHANGES.md
index 06417c4..3d1c357 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -13,9 +13,10 @@
 
 - [ADD] VideoEncoder にスケーリング機能を追加する
   - WebCodecs API 仕様に準拠: encode で渡されるフレームの解像度と configure で指定した解像度が異なる場合に自動的にスケーリング
-  - Apple Video Toolbox: VTPixelTransferSession を使用 (Metal ベースの HW アクセラレーション)
-  - ソフトウェアエンコーダー (AV1/VP8/VP9): libyuv の I420Scale を使用
-  - NVENC / Intel VPL: libyuv の I420Scale を使用
+  - 対応ピクセルフォーマット: I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR
+  - Apple Video Toolbox: VTPixelTransferSession を使用 (I420, NV12, BGRA のみ直接対応、他は NV12 に変換)
+  - ソフトウェアエンコーダー (AV1/VP8/VP9): libyuv を使用してフォーマット変換とスケーリング
+  - NVENC / Intel VPL: libyuv を使用してフォーマット変換とスケーリング
   - @voluntas
 - [ADD] VP9 で scalabilityMode (L1T2/L1T3) をサポートする
   - VideoEncoderConfig で `scalability_mode` を指定可能
diff --git a/docs/PYTHON_INTERFACE.md b/docs/PYTHON_INTERFACE.md
index 6aa8171..eaf6429 100644
--- a/docs/PYTHON_INTERFACE.md
+++ b/docs/PYTHON_INTERFACE.md
@@ -949,18 +949,20 @@ encoder.close()
 
 **スケーリング実装の詳細**:
 
-| エンコーダー | スケーリング方式 | 備考 |
-|------------|----------------|------|
-| Apple Video Toolbox (H.264/HEVC) | VTPixelTransferSession | Metal ベースの HW アクセラレーション |
-| ソフトウェアエンコーダー (AV1/VP8/VP9) | libyuv I420Scale | kFilterBox 補間 |
-| NVIDIA Video Codec SDK (NVENC) | libyuv I420Scale | NV12→I420→スケーリング→NV12 |
-| Intel VPL | libyuv I420Scale | NV12→I420→スケーリング→NV12 |
+| エンコーダー | スケーリング方式 | 対応フォーマット |
+|------------|----------------|----------------|
+| Apple Video Toolbox (H.264/HEVC) | VTPixelTransferSession (HWA) | I420, NV12, BGRA |
+| ソフトウェアエンコーダー (AV1/VP8/VP9) | libyuv (各フォーマット対応) | I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR |
+| NVIDIA Video Codec SDK (NVENC) | libyuv (各フォーマット対応) | I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR |
+| Intel VPL | libyuv (各フォーマット対応) | I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR |
 
 **注意事項**:
 
 - スケーリングはダウンスケール、アップスケールの両方に対応
 - アスペクト比は `configure()` で指定した解像度に合わせられる（引き伸ばし）
 - 同じ解像度のフレームはスケーリング処理をスキップ
+- 入力フォーマットに応じた libyuv スケーラーが使用される (I420Scale, I422Scale, I444Scale, NV12Scale, ARGBScale)
+- RGB/BGR フォーマットは I420/NV12 に変換後スケーリング (libyuv に RGBScale がないため)
 
 ## 独自インターフェース
 
diff --git a/src/bindings/video_encoder.cpp b/src/bindings/video_encoder.cpp
index 2e77158..2e33886 100644
--- a/src/bindings/video_encoder.cpp
+++ b/src/bindings/video_encoder.cpp
@@ -260,6 +260,7 @@ static ScalabilityModeConfig parse_scalability_mode(const std::string& mode) {
 #include "video_encoder_aom.cpp"
 #include "video_encoder_apple_video_toolbox.cpp"
 #include "video_encoder_nvidia.cpp"
+#include "video_scaler.cpp"
 #if defined(__APPLE__) || defined(__linux__)
 #include "video_encoder_vpx.cpp"
 #endif
diff --git a/src/bindings/video_encoder_aom.cpp b/src/bindings/video_encoder_aom.cpp
index e72e7fe..58b105c 100644
--- a/src/bindings/video_encoder_aom.cpp
+++ b/src/bindings/video_encoder_aom.cpp
@@ -3,9 +3,8 @@
 #include <cstring>
 #include <thread>
 
-#include <libyuv.h>
-
 #include "video_encoder.h"
+#include "video_scaler.h"
 
 // WebRTC の NumberOfThreads ロジックに準拠
 // タイル数（1, 2, 4, 8）に合わせてスレッド数を決定
@@ -363,53 +362,16 @@ void VideoEncoder::encode_frame_aom(const VideoFrame& frame,
     svc_metadata = SvcOutputMetadata(temporal_layer_id);
   }
 
-  // スケーリングが必要かどうかを判定
-  bool needs_scaling =
-      (frame.width() != config_.width || frame.height() != config_.height);
-
-  // スケーリング用のバッファ
-  std::vector<uint8_t> scaled_buffer;
-  const uint8_t* src_y = frame.plane_ptr(0);
-  const uint8_t* src_u = frame.plane_ptr(1);
-  const uint8_t* src_v = frame.plane_ptr(2);
-  int src_stride_y = static_cast<int>(frame.width());
-  int src_stride_u = static_cast<int>(frame.width() / 2);
-  int src_stride_v = static_cast<int>(frame.width() / 2);
-
-  // スケーリングが必要な場合は libyuv で変換
-  if (needs_scaling) {
-    uint32_t dst_width = config_.width;
-    uint32_t dst_height = config_.height;
-    size_t y_size = dst_width * dst_height;
-    size_t uv_size = (dst_width / 2) * (dst_height / 2);
-    scaled_buffer.resize(y_size + uv_size * 2);
-
-    uint8_t* dst_y = scaled_buffer.data();
-    uint8_t* dst_u = dst_y + y_size;
-    uint8_t* dst_v = dst_u + uv_size;
-    int dst_stride_y = static_cast<int>(dst_width);
-    int dst_stride_u = static_cast<int>(dst_width / 2);
-    int dst_stride_v = static_cast<int>(dst_width / 2);
-
-    int result = libyuv::I420Scale(
-        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-        static_cast<int>(frame.width()), static_cast<int>(frame.height()),
-        dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
-        static_cast<int>(dst_width), static_cast<int>(dst_height),
-        libyuv::kFilterBox);
-
-    if (result != 0) {
-      throw std::runtime_error("libyuv::I420Scale failed");
-    }
+  // スケーリングと I420 変換
+  auto scaled =
+      video_scaler::scale_to_i420(frame, config_.width, config_.height);
 
-    // スケーリング後のポインタとストライドを更新
-    src_y = dst_y;
-    src_u = dst_u;
-    src_v = dst_v;
-    src_stride_y = dst_stride_y;
-    src_stride_u = dst_stride_u;
-    src_stride_v = dst_stride_v;
-  }
+  const uint8_t* src_y = scaled.y;
+  const uint8_t* src_u = scaled.u;
+  const uint8_t* src_v = scaled.v;
+  int src_stride_y = scaled.stride_y;
+  int src_stride_u = scaled.stride_u;
+  int src_stride_v = scaled.stride_v;
 
   // Wrap I420 memory from VideoFrame or scaled buffer
   aom_image_t img;
diff --git a/src/bindings/video_encoder_apple_video_toolbox.cpp b/src/bindings/video_encoder_apple_video_toolbox.cpp
index b4e3385..a4f8b45 100644
--- a/src/bindings/video_encoder_apple_video_toolbox.cpp
+++ b/src/bindings/video_encoder_apple_video_toolbox.cpp
@@ -4,12 +4,13 @@
 #include <CoreFoundation/CoreFoundation.h>
 #include <CoreVideo/CoreVideo.h>
 #include <VideoToolbox/VideoToolbox.h>
+#include <libyuv.h>
 #include <nanobind/nanobind.h>
 #include <memory>
 #include <vector>
 
 #include "encoded_video_chunk.h"
-#include "video_frame.h"  // VideoFrame の完全な定義が必要
+#include "video_frame.h"
 
 namespace nb = nanobind;
 
@@ -446,15 +447,41 @@ void VideoEncoder::encode_frame_videotoolbox(
 
   // native_buffer がない場合は CVPixelBuffer を作成してコピー
   if (!pb_from_native) {
-    // Make sure we have NV12 source
-    std::unique_ptr<VideoFrame> nv12;
-    if (frame.format() != VideoPixelFormat::NV12) {
-      nv12 = frame.convert_format(VideoPixelFormat::NV12);
+    // スケーリング時は VTPixelTransferSession でフォーマット変換とスケーリングを同時に行う
+    // VTPixelTransferSession がサポートするフォーマット: I420, NV12, BGRA
+    // スケーリングなしの場合は NV12 に変換が必要
+    bool use_native_format =
+        needs_scaling && (frame.format() == VideoPixelFormat::I420 ||
+                          frame.format() == VideoPixelFormat::NV12 ||
+                          frame.format() == VideoPixelFormat::BGRA);
+
+    // 入力フレームを変換するかどうかを決定
+    std::unique_ptr<VideoFrame> converted;
+    const VideoFrame* src_frame = &frame;
+
+    if (!use_native_format && frame.format() != VideoPixelFormat::NV12) {
+      // VTPixelTransferSession がサポートしないフォーマット、またはスケーリングなしの場合
+      // NV12 に変換
+      converted = frame.convert_format(VideoPixelFormat::NV12);
+      src_frame = converted.get();
+    }
+
+    // CVPixelBuffer のピクセルフォーマットを決定
+    OSType pixel_format;
+    switch (src_frame->format()) {
+      case VideoPixelFormat::I420:
+        pixel_format = kCVPixelFormatType_420YpCbCr8Planar;
+        break;
+      case VideoPixelFormat::BGRA:
+        pixel_format = kCVPixelFormatType_32BGRA;
+        break;
+      case VideoPixelFormat::NV12:
+      default:
+        pixel_format = kCVPixelFormatType_420YpCbCr8BiPlanarFullRange;
+        break;
     }
-    const VideoFrame& src = nv12 ? *nv12 : frame;
 
     // 入力フレームサイズの CVPixelBuffer を作成
-    OSType pixel_format = kCVPixelFormatType_420YpCbCr8BiPlanarFullRange;
     CFDictionaryRef empty_dict = CFDictionaryCreate(
         kCFAllocatorDefault, nullptr, nullptr, 0,
         &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
@@ -464,8 +491,9 @@ void VideoEncoder::encode_frame_videotoolbox(
         kCFAllocatorDefault, pb_keys, pb_vals, 1,
         &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);
 
-    CVReturn r = CVPixelBufferCreate(kCFAllocatorDefault, src.width(),
-                                     src.height(), pixel_format, pb_attrs, &pb);
+    CVReturn r =
+        CVPixelBufferCreate(kCFAllocatorDefault, src_frame->width(),
+                            src_frame->height(), pixel_format, pb_attrs, &pb);
 
     CFRelease(pb_attrs);
     CFRelease(empty_dict);
@@ -474,37 +502,79 @@ void VideoEncoder::encode_frame_videotoolbox(
       throw std::runtime_error("Failed to create CVPixelBuffer for input");
     }
 
-    // Copy planes into CVPixelBuffer
+    // フォーマットに応じてデータをコピー
     CVPixelBufferLockBaseAddress(pb, 0);
-    uint8_t* dst_y = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 0);
-    size_t dst_stride_y = CVPixelBufferGetBytesPerRowOfPlane(pb, 0);
-    uint8_t* dst_uv = (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 1);
-    size_t dst_stride_uv = CVPixelBufferGetBytesPerRowOfPlane(pb, 1);
-
-    const uint8_t* src_y = src.plane_ptr(0);
-    const uint8_t* src_uv = src.plane_ptr(1);
-    int width = static_cast<int>(src.width());
-    int height = static_cast<int>(src.height());
-    int chroma_height = (height + 1) / 2;
-    // Y plane
-    if (dst_stride_y == static_cast<size_t>(width)) {
-      memcpy(dst_y, src_y, static_cast<size_t>(width * height));
-    } else {
-      for (int i = 0; i < height; ++i) {
-        memcpy(dst_y + i * dst_stride_y, src_y + i * width, width);
+
+    switch (src_frame->format()) {
+      case VideoPixelFormat::I420: {
+        // I420: 3 プレーン (Y, U, V)
+        int width = static_cast<int>(src_frame->width());
+        int height = static_cast<int>(src_frame->height());
+        int chroma_width = (width + 1) / 2;
+        int chroma_height = (height + 1) / 2;
+
+        // Y plane
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(0), width,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 0),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 0)), width,
+            height);
+
+        // U plane
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(1), chroma_width,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 1),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 1)),
+            chroma_width, chroma_height);
+
+        // V plane
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(2), chroma_width,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 2),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 2)),
+            chroma_width, chroma_height);
+        break;
       }
-    }
-    // UV plane (interleaved)
-    int chroma_row_bytes = ((width + 1) / 2) * 2;
-    if (dst_stride_uv == static_cast<size_t>(chroma_row_bytes)) {
-      memcpy(dst_uv, src_uv,
-             static_cast<size_t>(chroma_row_bytes * chroma_height));
-    } else {
-      for (int i = 0; i < chroma_height; ++i) {
-        memcpy(dst_uv + i * dst_stride_uv, src_uv + i * chroma_row_bytes,
-               chroma_row_bytes);
+
+      case VideoPixelFormat::BGRA: {
+        // BGRA: 単一プレーン
+        int width = static_cast<int>(src_frame->width());
+        int height = static_cast<int>(src_frame->height());
+        int row_bytes = width * 4;
+
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(0), row_bytes,
+            (uint8_t*)CVPixelBufferGetBaseAddress(pb),
+            static_cast<int>(CVPixelBufferGetBytesPerRow(pb)), row_bytes,
+            height);
+        break;
+      }
+
+      case VideoPixelFormat::NV12:
+      default: {
+        // NV12: 2 プレーン (Y, UV)
+        int width = static_cast<int>(src_frame->width());
+        int height = static_cast<int>(src_frame->height());
+        int chroma_height = (height + 1) / 2;
+        int chroma_row_bytes = ((width + 1) / 2) * 2;
+
+        // Y plane
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(0), width,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 0),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 0)), width,
+            height);
+
+        // UV plane (interleaved)
+        libyuv::CopyPlane(
+            src_frame->plane_ptr(1), chroma_row_bytes,
+            (uint8_t*)CVPixelBufferGetBaseAddressOfPlane(pb, 1),
+            static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pb, 1)),
+            chroma_row_bytes, chroma_height);
+        break;
       }
     }
+
     CVPixelBufferUnlockBaseAddress(pb, 0);
   }
 
@@ -533,7 +603,8 @@ void VideoEncoder::encode_frame_videotoolbox(
           "Failed to create scaled CVPixelBuffer from pool");
     }
 
-    // VTPixelTransferSessionTransferImage でスケーリング
+    // VTPixelTransferSessionTransferImage でスケーリングとフォーマット変換を実行
+    // 入力は I420/NV12/BGRA のいずれか、出力は NV12
     auto transfer_session =
         (VTPixelTransferSessionRef)vt_pixel_transfer_session_;
     OSStatus transfer_err =
diff --git a/src/bindings/video_encoder_intel_vpl.cpp b/src/bindings/video_encoder_intel_vpl.cpp
index 56b8bfd..22a2c61 100644
--- a/src/bindings/video_encoder_intel_vpl.cpp
+++ b/src/bindings/video_encoder_intel_vpl.cpp
@@ -11,12 +11,11 @@
 #include <stdexcept>
 #include <vector>
 
-#include <libyuv.h>
-
 #include "../dyn/vpl.h"
 #include "encoded_video_chunk.h"
 #include "intel_vpl_helpers.h"
 #include "video_frame.h"
+#include "video_scaler.h"
 
 namespace nb = nanobind;
 
@@ -312,77 +311,14 @@ void VideoEncoder::encode_frame_intel_vpl(const VideoFrame& frame,
 
   mfxSession session = static_cast<mfxSession>(vpl_session_);
 
-  // NV12 フォーマットに変換
-  std::unique_ptr<VideoFrame> nv12;
-  if (frame.format() != VideoPixelFormat::NV12) {
-    nv12 = frame.convert_format(VideoPixelFormat::NV12);
-  }
-  const VideoFrame& src = nv12 ? *nv12 : frame;
-
-  // スケーリングが必要かどうかを判定
-  bool needs_scaling =
-      (src.width() != config_.width || src.height() != config_.height);
-
-  // スケーリング用のバッファ (I420 経由でスケーリング)
-  std::vector<uint8_t> scaled_i420_buffer;
-  std::vector<uint8_t> scaled_nv12_buffer;
-  const uint8_t* final_y = src.plane_ptr(0);
-  const uint8_t* final_uv = src.plane_ptr(1);
-  uint32_t final_width = src.width();
-  uint32_t final_height = src.height();
-
-  if (needs_scaling) {
-    // NV12 -> I420 に変換してからスケーリング
-    size_t src_i420_size =
-        src.width() * src.height() + (src.width() / 2) * (src.height() / 2) * 2;
-    std::vector<uint8_t> src_i420_buffer(src_i420_size);
-
-    uint8_t* src_i420_y = src_i420_buffer.data();
-    uint8_t* src_i420_u = src_i420_y + src.width() * src.height();
-    uint8_t* src_i420_v = src_i420_u + (src.width() / 2) * (src.height() / 2);
-
-    libyuv::NV12ToI420(src.plane_ptr(0), src.width(), src.plane_ptr(1),
-                       src.width(), src_i420_y, src.width(), src_i420_u,
-                       src.width() / 2, src_i420_v, src.width() / 2,
-                       src.width(), src.height());
-
-    // I420 でスケーリング
-    uint32_t dst_width = config_.width;
-    uint32_t dst_height = config_.height;
-    size_t dst_i420_size =
-        dst_width * dst_height + (dst_width / 2) * (dst_height / 2) * 2;
-    scaled_i420_buffer.resize(dst_i420_size);
-
-    uint8_t* dst_i420_y = scaled_i420_buffer.data();
-    uint8_t* dst_i420_u = dst_i420_y + dst_width * dst_height;
-    uint8_t* dst_i420_v = dst_i420_u + (dst_width / 2) * (dst_height / 2);
-
-    int result = libyuv::I420Scale(
-        src_i420_y, src.width(), src_i420_u, src.width() / 2, src_i420_v,
-        src.width() / 2, src.width(), src.height(), dst_i420_y, dst_width,
-        dst_i420_u, dst_width / 2, dst_i420_v, dst_width / 2, dst_width,
-        dst_height, libyuv::kFilterBox);
-
-    if (result != 0) {
-      throw std::runtime_error("libyuv::I420Scale failed");
-    }
-
-    // I420 -> NV12 に変換
-    size_t nv12_size = dst_width * dst_height * 3 / 2;
-    scaled_nv12_buffer.resize(nv12_size);
+  // スケーリングと NV12 変換
+  auto scaled =
+      video_scaler::scale_to_nv12(frame, config_.width, config_.height);
 
-    uint8_t* nv12_y = scaled_nv12_buffer.data();
-    uint8_t* nv12_uv = nv12_y + dst_width * dst_height;
-
-    libyuv::I420ToNV12(dst_i420_y, dst_width, dst_i420_u, dst_width / 2,
-                       dst_i420_v, dst_width / 2, nv12_y, dst_width, nv12_uv,
-                       dst_width, dst_width, dst_height);
-
-    final_y = nv12_y;
-    final_uv = nv12_uv;
-    final_width = dst_width;
-    final_height = dst_height;
-  }
+  const uint8_t* final_y = scaled.y;
+  const uint8_t* final_uv = scaled.uv;
+  uint32_t final_width = scaled.width;
+  uint32_t final_height = scaled.height;
 
   // サーフェスプールから未使用のサーフェスを取得
   intel_vpl::SurfacePool* pool =
diff --git a/src/bindings/video_encoder_nvidia.cpp b/src/bindings/video_encoder_nvidia.cpp
index deeae2f..d865d51 100644
--- a/src/bindings/video_encoder_nvidia.cpp
+++ b/src/bindings/video_encoder_nvidia.cpp
@@ -13,12 +13,11 @@
 #include <stdexcept>
 #include <vector>
 
-#include <libyuv.h>
-
 #include "../dyn/cuda.h"
 #include "../dyn/nvenc.h"
 #include "encoded_video_chunk.h"
 #include "video_frame.h"
+#include "video_scaler.h"
 
 namespace nb = nanobind;
 
@@ -385,77 +384,14 @@ void VideoEncoder::encode_frame_nvenc(const VideoFrame& frame,
     throw std::runtime_error("NVENC encoder is not initialized");
   }
 
-  // NV12 フォーマットに変換
-  std::unique_ptr<VideoFrame> nv12;
-  if (frame.format() != VideoPixelFormat::NV12) {
-    nv12 = frame.convert_format(VideoPixelFormat::NV12);
-  }
-  const VideoFrame& src = nv12 ? *nv12 : frame;
-
-  // スケーリングが必要かどうかを判定
-  bool needs_scaling =
-      (src.width() != config_.width || src.height() != config_.height);
-
-  // スケーリング用のバッファ (I420 経由でスケーリング)
-  std::vector<uint8_t> scaled_i420_buffer;
-  std::vector<uint8_t> scaled_nv12_buffer;
-  const uint8_t* final_y = src.plane_ptr(0);
-  const uint8_t* final_uv = src.plane_ptr(1);
-  uint32_t final_width = src.width();
-  uint32_t final_height = src.height();
-
-  if (needs_scaling) {
-    // NV12 -> I420 に変換してからスケーリング
-    size_t src_i420_size =
-        src.width() * src.height() + (src.width() / 2) * (src.height() / 2) * 2;
-    std::vector<uint8_t> src_i420_buffer(src_i420_size);
-
-    uint8_t* src_i420_y = src_i420_buffer.data();
-    uint8_t* src_i420_u = src_i420_y + src.width() * src.height();
-    uint8_t* src_i420_v = src_i420_u + (src.width() / 2) * (src.height() / 2);
-
-    libyuv::NV12ToI420(src.plane_ptr(0), src.width(), src.plane_ptr(1),
-                       src.width(), src_i420_y, src.width(), src_i420_u,
-                       src.width() / 2, src_i420_v, src.width() / 2,
-                       src.width(), src.height());
-
-    // I420 でスケーリング
-    uint32_t dst_width = config_.width;
-    uint32_t dst_height = config_.height;
-    size_t dst_i420_size =
-        dst_width * dst_height + (dst_width / 2) * (dst_height / 2) * 2;
-    scaled_i420_buffer.resize(dst_i420_size);
-
-    uint8_t* dst_i420_y = scaled_i420_buffer.data();
-    uint8_t* dst_i420_u = dst_i420_y + dst_width * dst_height;
-    uint8_t* dst_i420_v = dst_i420_u + (dst_width / 2) * (dst_height / 2);
-
-    int result = libyuv::I420Scale(
-        src_i420_y, src.width(), src_i420_u, src.width() / 2, src_i420_v,
-        src.width() / 2, src.width(), src.height(), dst_i420_y, dst_width,
-        dst_i420_u, dst_width / 2, dst_i420_v, dst_width / 2, dst_width,
-        dst_height, libyuv::kFilterBox);
-
-    if (result != 0) {
-      throw std::runtime_error("libyuv::I420Scale failed");
-    }
-
-    // I420 -> NV12 に変換
-    size_t nv12_size = dst_width * dst_height * 3 / 2;
-    scaled_nv12_buffer.resize(nv12_size);
+  // スケーリングと NV12 変換
+  auto scaled =
+      video_scaler::scale_to_nv12(frame, config_.width, config_.height);
 
-    uint8_t* nv12_y = scaled_nv12_buffer.data();
-    uint8_t* nv12_uv = nv12_y + dst_width * dst_height;
-
-    libyuv::I420ToNV12(dst_i420_y, dst_width, dst_i420_u, dst_width / 2,
-                       dst_i420_v, dst_width / 2, nv12_y, dst_width, nv12_uv,
-                       dst_width, dst_width, dst_height);
-
-    final_y = nv12_y;
-    final_uv = nv12_uv;
-    final_width = dst_width;
-    final_height = dst_height;
-  }
+  const uint8_t* final_y = scaled.y;
+  const uint8_t* final_uv = scaled.uv;
+  uint32_t final_width = scaled.width;
+  uint32_t final_height = scaled.height;
 
   // 入力バッファをロック
   NV_ENC_LOCK_INPUT_BUFFER lock_input_buffer = {};
diff --git a/src/bindings/video_encoder_vpx.cpp b/src/bindings/video_encoder_vpx.cpp
index bc27500..b8b6f7d 100644
--- a/src/bindings/video_encoder_vpx.cpp
+++ b/src/bindings/video_encoder_vpx.cpp
@@ -5,7 +5,7 @@
 #include <cstring>
 #include <thread>
 
-#include <libyuv.h>
+#include "video_scaler.h"
 
 // WebRTC の NumberOfThreads ロジックに準拠
 static int calculate_vpx_number_of_threads(int width,
@@ -284,53 +284,16 @@ void VideoEncoder::encode_frame_vpx(const VideoFrame& frame,
     vpx_codec_control(vpx_encoder_, VP9E_SET_SVC_LAYER_ID, &layer_id);
   }
 
-  // スケーリングが必要かどうかを判定
-  bool needs_scaling =
-      (frame.width() != config_.width || frame.height() != config_.height);
-
-  // スケーリング用のバッファ
-  std::vector<uint8_t> scaled_buffer;
-  const uint8_t* src_y = frame.plane_ptr(0);
-  const uint8_t* src_u = frame.plane_ptr(1);
-  const uint8_t* src_v = frame.plane_ptr(2);
-  int src_stride_y = static_cast<int>(frame.width());
-  int src_stride_u = static_cast<int>(frame.width() / 2);
-  int src_stride_v = static_cast<int>(frame.width() / 2);
-
-  // スケーリングが必要な場合は libyuv で変換
-  if (needs_scaling) {
-    uint32_t dst_width = config_.width;
-    uint32_t dst_height = config_.height;
-    size_t y_size = dst_width * dst_height;
-    size_t uv_size = (dst_width / 2) * (dst_height / 2);
-    scaled_buffer.resize(y_size + uv_size * 2);
-
-    uint8_t* dst_y = scaled_buffer.data();
-    uint8_t* dst_u = dst_y + y_size;
-    uint8_t* dst_v = dst_u + uv_size;
-    int dst_stride_y = static_cast<int>(dst_width);
-    int dst_stride_u = static_cast<int>(dst_width / 2);
-    int dst_stride_v = static_cast<int>(dst_width / 2);
-
-    int result = libyuv::I420Scale(
-        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-        static_cast<int>(frame.width()), static_cast<int>(frame.height()),
-        dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
-        static_cast<int>(dst_width), static_cast<int>(dst_height),
-        libyuv::kFilterBox);
-
-    if (result != 0) {
-      throw std::runtime_error("libyuv::I420Scale failed");
-    }
+  // スケーリングと I420 変換
+  auto scaled =
+      video_scaler::scale_to_i420(frame, config_.width, config_.height);
 
-    // スケーリング後のポインタとストライドを更新
-    src_y = dst_y;
-    src_u = dst_u;
-    src_v = dst_v;
-    src_stride_y = dst_stride_y;
-    src_stride_u = dst_stride_u;
-    src_stride_v = dst_stride_v;
-  }
+  const uint8_t* src_y = scaled.y;
+  const uint8_t* src_u = scaled.u;
+  const uint8_t* src_v = scaled.v;
+  int src_stride_y = scaled.stride_y;
+  int src_stride_u = scaled.stride_u;
+  int src_stride_v = scaled.stride_v;
 
   // I420 イメージをラップ
   vpx_image_t img;
diff --git a/src/bindings/video_scaler.cpp b/src/bindings/video_scaler.cpp
new file mode 100644
index 0000000..6acf0c8
--- /dev/null
+++ b/src/bindings/video_scaler.cpp
@@ -0,0 +1,762 @@
+// スケーリングヘルパー関数の実装
+
+#include "video_scaler.h"
+
+#include <libyuv.h>
+#include <stdexcept>
+
+namespace video_scaler {
+
+namespace {
+
+// Scales I420 (Y, U, V planes) via libyuv::I420Scale with kFilterBox; returns its int result.
+int scale_i420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               int src_width,
+               int src_height,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int dst_width,
+               int dst_height) {
+  return libyuv::I420Scale(
+      src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_width,
+      src_height, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+      dst_width, dst_height, libyuv::kFilterBox);
+}
+
+// Scales I422 (Y, U, V planes) via libyuv::I422Scale with kFilterBox; returns its int result.
+int scale_i422(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               int src_width,
+               int src_height,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int dst_width,
+               int dst_height) {
+  return libyuv::I422Scale(
+      src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_width,
+      src_height, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+      dst_width, dst_height, libyuv::kFilterBox);
+}
+
+// Scales I444 (Y, U, V planes) via libyuv::I444Scale with kFilterBox; returns its int result.
+int scale_i444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               int src_width,
+               int src_height,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int dst_width,
+               int dst_height) {
+  return libyuv::I444Scale(
+      src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_width,
+      src_height, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
+      dst_width, dst_height, libyuv::kFilterBox);
+}
+
+// Scales NV12 (Y plane + UV plane) via libyuv::NV12Scale with kFilterBox; returns its int result.
+int scale_nv12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               int src_width,
+               int src_height,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int dst_width,
+               int dst_height) {
+  return libyuv::NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv,
+                           src_width, src_height, dst_y, dst_stride_y, dst_uv,
+                           dst_stride_uv, dst_width, dst_height,
+                           libyuv::kFilterBox);
+}
+
+// Scales a single-buffer ARGB image via libyuv::ARGBScale with kFilterBox; returns its int result.
+int scale_argb(const uint8_t* src_argb,
+               int src_stride_argb,
+               int src_width,
+               int src_height,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int dst_width,
+               int dst_height) {
+  return libyuv::ARGBScale(src_argb, src_stride_argb, src_width, src_height,
+                           dst_argb, dst_stride_argb, dst_width, dst_height,
+                           libyuv::kFilterBox);
+}
+
+}  // namespace
+
+I420ScaleResult scale_to_i420(const VideoFrame& frame,
+                              uint32_t dst_width,
+                              uint32_t dst_height) {
+  I420ScaleResult result;
+  result.width = dst_width;
+  result.height = dst_height;
+
+  bool needs_scaling =
+      (frame.width() != dst_width || frame.height() != dst_height);
+
+  size_t y_size = dst_width * dst_height;
+  size_t uv_size = (dst_width / 2) * (dst_height / 2);
+
+  // スケーリングが不要かつ I420 の場合は元フレームのポインタを返す
+  if (!needs_scaling && frame.format() == VideoPixelFormat::I420) {
+    result.y = frame.plane_ptr(0);
+    result.u = frame.plane_ptr(1);
+    result.v = frame.plane_ptr(2);
+    result.stride_y = static_cast<int>(dst_width);
+    result.stride_u = static_cast<int>(dst_width / 2);
+    result.stride_v = static_cast<int>(dst_width / 2);
+    return result;
+  }
+
+  // スケーリング用のバッファ
+  std::vector<uint8_t> scaled_buffer;
+  uint32_t current_width = frame.width();
+  uint32_t current_height = frame.height();
+  VideoPixelFormat current_format = frame.format();
+
+  // 1. スケーリング (入力フォーマットのまま)
+  if (needs_scaling) {
+    int scale_result = 0;
+
+    switch (frame.format()) {
+      case VideoPixelFormat::I420: {
+        size_t dst_y_size = dst_width * dst_height;
+        size_t dst_uv_size = (dst_width / 2) * (dst_height / 2);
+        scaled_buffer.resize(dst_y_size + dst_uv_size * 2);
+        scale_result = scale_i420(
+            frame.plane_ptr(0), static_cast<int>(frame.width()),
+            frame.plane_ptr(1), static_cast<int>(frame.width() / 2),
+            frame.plane_ptr(2), static_cast<int>(frame.width() / 2),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + dst_y_size, static_cast<int>(dst_width / 2),
+            scaled_buffer.data() + dst_y_size + dst_uv_size,
+            static_cast<int>(dst_width / 2), static_cast<int>(dst_width),
+            static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::I422: {
+        size_t dst_y_size = dst_width * dst_height;
+        size_t dst_uv_size = (dst_width / 2) * dst_height;
+        scaled_buffer.resize(dst_y_size + dst_uv_size * 2);
+        scale_result = scale_i422(
+            frame.plane_ptr(0), static_cast<int>(frame.width()),
+            frame.plane_ptr(1), static_cast<int>(frame.width() / 2),
+            frame.plane_ptr(2), static_cast<int>(frame.width() / 2),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + dst_y_size, static_cast<int>(dst_width / 2),
+            scaled_buffer.data() + dst_y_size + dst_uv_size,
+            static_cast<int>(dst_width / 2), static_cast<int>(dst_width),
+            static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::I444: {
+        size_t plane_size = dst_width * dst_height;
+        scaled_buffer.resize(plane_size * 3);
+        scale_result = scale_i444(
+            frame.plane_ptr(0), static_cast<int>(frame.width()),
+            frame.plane_ptr(1), static_cast<int>(frame.width()),
+            frame.plane_ptr(2), static_cast<int>(frame.width()),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + plane_size, static_cast<int>(dst_width),
+            scaled_buffer.data() + plane_size * 2, static_cast<int>(dst_width),
+            static_cast<int>(dst_width), static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::NV12: {
+        size_t nv12_size = dst_width * dst_height * 3 / 2;
+        scaled_buffer.resize(nv12_size);
+        scale_result = scale_nv12(
+            frame.plane_ptr(0), static_cast<int>(frame.width()),
+            frame.plane_ptr(1), static_cast<int>(frame.width()),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + dst_width * dst_height,
+            static_cast<int>(dst_width), static_cast<int>(dst_width),
+            static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::RGBA:
+      case VideoPixelFormat::BGRA: {
+        size_t argb_size = dst_width * dst_height * 4;
+        scaled_buffer.resize(argb_size);
+        scale_result = scale_argb(
+            frame.plane_ptr(0), static_cast<int>(frame.width() * 4),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width * 4),
+            static_cast<int>(dst_width), static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::RGB:
+      case VideoPixelFormat::BGR: {
+        // RGB/BGR は直接スケーリングできないため、I420 に変換してからスケーリング
+        size_t src_y_size = frame.width() * frame.height();
+        size_t src_uv_size = (frame.width() / 2) * (frame.height() / 2);
+        std::vector<uint8_t> src_i420(src_y_size + src_uv_size * 2);
+        uint8_t* src_i420_y = src_i420.data();
+        uint8_t* src_i420_u = src_i420_y + src_y_size;
+        uint8_t* src_i420_v = src_i420_u + src_uv_size;
+
+        if (frame.format() == VideoPixelFormat::RGB) {
+          libyuv::RGB24ToI420(frame.plane_ptr(0),
+                              static_cast<int>(frame.width() * 3), src_i420_y,
+                              static_cast<int>(frame.width()), src_i420_u,
+                              static_cast<int>(frame.width() / 2), src_i420_v,
+                              static_cast<int>(frame.width() / 2),
+                              static_cast<int>(frame.width()),
+                              static_cast<int>(frame.height()));
+        } else {
+          libyuv::RAWToI420(frame.plane_ptr(0),
+                            static_cast<int>(frame.width() * 3), src_i420_y,
+                            static_cast<int>(frame.width()), src_i420_u,
+                            static_cast<int>(frame.width() / 2), src_i420_v,
+                            static_cast<int>(frame.width() / 2),
+                            static_cast<int>(frame.width()),
+                            static_cast<int>(frame.height()));
+        }
+
+        // I420 でスケーリング
+        size_t dst_y_size = dst_width * dst_height;
+        size_t dst_uv_size = (dst_width / 2) * (dst_height / 2);
+        scaled_buffer.resize(dst_y_size + dst_uv_size * 2);
+
+        scale_result = scale_i420(
+            src_i420_y, static_cast<int>(frame.width()), src_i420_u,
+            static_cast<int>(frame.width() / 2), src_i420_v,
+            static_cast<int>(frame.width() / 2),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + dst_y_size, static_cast<int>(dst_width / 2),
+            scaled_buffer.data() + dst_y_size + dst_uv_size,
+            static_cast<int>(dst_width / 2), static_cast<int>(dst_width),
+            static_cast<int>(dst_height));
+
+        // スケーリング後は I420
+        current_format = VideoPixelFormat::I420;
+        break;
+      }
+    }
+
+    if (scale_result != 0) {
+      throw std::runtime_error("libyuv scale failed");
+    }
+
+    current_width = dst_width;
+    current_height = dst_height;
+  }
+
+  // 2. I420 に変換
+  if (current_format == VideoPixelFormat::I420) {
+    // スケーリング済みの I420 バッファをそのまま使用
+    result.buffer = std::move(scaled_buffer);
+    result.y = result.buffer.data();
+    result.u = result.buffer.data() + y_size;
+    result.v = result.buffer.data() + y_size + uv_size;
+  } else {
+    // I420 以外のフォーマットは変換が必要
+    result.buffer.resize(y_size + uv_size * 2);
+    uint8_t* dst_y = result.buffer.data();
+    uint8_t* dst_u = dst_y + y_size;
+    uint8_t* dst_v = dst_u + uv_size;
+
+    const uint8_t* src_data =
+        needs_scaling ? scaled_buffer.data() : frame.plane_ptr(0);
+
+    switch (current_format) {
+      case VideoPixelFormat::I422: {
+        size_t src_uv_size = (current_width / 2) * current_height;
+        const uint8_t* src_u_ptr =
+            needs_scaling
+                ? scaled_buffer.data() + current_width * current_height
+                : frame.plane_ptr(1);
+        const uint8_t* src_v_ptr =
+            needs_scaling ? scaled_buffer.data() +
+                                current_width * current_height + src_uv_size
+                          : frame.plane_ptr(2);
+        libyuv::I422ToI420(src_data, static_cast<int>(current_width), src_u_ptr,
+                           static_cast<int>(current_width / 2), src_v_ptr,
+                           static_cast<int>(current_width / 2), dst_y,
+                           static_cast<int>(current_width), dst_u,
+                           static_cast<int>(current_width / 2), dst_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::I444: {
+        size_t plane_size = current_width * current_height;
+        const uint8_t* src_u_ptr = needs_scaling
+                                       ? scaled_buffer.data() + plane_size
+                                       : frame.plane_ptr(1);
+        const uint8_t* src_v_ptr = needs_scaling
+                                       ? scaled_buffer.data() + plane_size * 2
+                                       : frame.plane_ptr(2);
+        libyuv::I444ToI420(src_data, static_cast<int>(current_width), src_u_ptr,
+                           static_cast<int>(current_width), src_v_ptr,
+                           static_cast<int>(current_width), dst_y,
+                           static_cast<int>(current_width), dst_u,
+                           static_cast<int>(current_width / 2), dst_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::NV12: {
+        const uint8_t* src_uv =
+            needs_scaling
+                ? scaled_buffer.data() + current_width * current_height
+                : frame.plane_ptr(1);
+        libyuv::NV12ToI420(src_data, static_cast<int>(current_width), src_uv,
+                           static_cast<int>(current_width), dst_y,
+                           static_cast<int>(current_width), dst_u,
+                           static_cast<int>(current_width / 2), dst_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::RGBA: {
+        libyuv::ABGRToI420(src_data, static_cast<int>(current_width * 4), dst_y,
+                           static_cast<int>(current_width), dst_u,
+                           static_cast<int>(current_width / 2), dst_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::BGRA: {
+        libyuv::ARGBToI420(src_data, static_cast<int>(current_width * 4), dst_y,
+                           static_cast<int>(current_width), dst_u,
+                           static_cast<int>(current_width / 2), dst_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::RGB: {
+        libyuv::RGB24ToI420(src_data, static_cast<int>(current_width * 3),
+                            dst_y, static_cast<int>(current_width), dst_u,
+                            static_cast<int>(current_width / 2), dst_v,
+                            static_cast<int>(current_width / 2),
+                            static_cast<int>(current_width),
+                            static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::BGR: {
+        libyuv::RAWToI420(src_data, static_cast<int>(current_width * 3), dst_y,
+                          static_cast<int>(current_width), dst_u,
+                          static_cast<int>(current_width / 2), dst_v,
+                          static_cast<int>(current_width / 2),
+                          static_cast<int>(current_width),
+                          static_cast<int>(current_height));
+        break;
+      }
+      default:
+        throw std::runtime_error(
+            "Unsupported pixel format for I420 conversion");
+    }
+
+    result.y = dst_y;
+    result.u = dst_u;
+    result.v = dst_v;
+  }
+
+  result.stride_y = static_cast<int>(dst_width);
+  result.stride_u = static_cast<int>(dst_width / 2);
+  result.stride_v = static_cast<int>(dst_width / 2);
+
+  return result;
+}
+
+NV12ScaleResult scale_to_nv12(const VideoFrame& frame,
+                              uint32_t dst_width,
+                              uint32_t dst_height) {
+  NV12ScaleResult result;
+  result.width = dst_width;
+  result.height = dst_height;
+
+  bool needs_scaling =
+      (frame.width() != dst_width || frame.height() != dst_height);
+
+  size_t nv12_size = dst_width * dst_height * 3 / 2;  // NOTE(review): presumably multiplied as uint32_t before widening
+
+  // No scaling and already NV12: return the frame's own plane pointers (buffer stays empty)
+  if (!needs_scaling && frame.format() == VideoPixelFormat::NV12) {
+    result.y = frame.plane_ptr(0);
+    result.uv = frame.plane_ptr(1);
+    return result;
+  }
+
+  // Output of step 1: the scaled image (source format, or NV12 for RGB/BGR input)
+  std::vector<uint8_t> scaled_buffer;
+  uint32_t current_width = frame.width();
+  uint32_t current_height = frame.height();
+  VideoPixelFormat current_format = frame.format();
+
+  // 1. Scale in the input pixel format (chroma sizes use w/2, h/2: assumes even dimensions, TODO confirm)
+  if (needs_scaling) {
+    int scale_result = 0;
+
+    switch (frame.format()) {
+      case VideoPixelFormat::I420: {
+        size_t dst_y_size = dst_width * dst_height;
+        size_t dst_uv_size = (dst_width / 2) * (dst_height / 2);
+        scaled_buffer.resize(dst_y_size + dst_uv_size * 2);
+        scale_result = scale_i420(
+            frame.plane_ptr(0), static_cast<int>(frame.width()),
+            frame.plane_ptr(1), static_cast<int>(frame.width() / 2),
+            frame.plane_ptr(2), static_cast<int>(frame.width() / 2),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + dst_y_size, static_cast<int>(dst_width / 2),
+            scaled_buffer.data() + dst_y_size + dst_uv_size,
+            static_cast<int>(dst_width / 2), static_cast<int>(dst_width),
+            static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::I422: {
+        size_t dst_y_size = dst_width * dst_height;
+        size_t dst_uv_size = (dst_width / 2) * dst_height;
+        scaled_buffer.resize(dst_y_size + dst_uv_size * 2);
+        scale_result = scale_i422(
+            frame.plane_ptr(0), static_cast<int>(frame.width()),
+            frame.plane_ptr(1), static_cast<int>(frame.width() / 2),
+            frame.plane_ptr(2), static_cast<int>(frame.width() / 2),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + dst_y_size, static_cast<int>(dst_width / 2),
+            scaled_buffer.data() + dst_y_size + dst_uv_size,
+            static_cast<int>(dst_width / 2), static_cast<int>(dst_width),
+            static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::I444: {
+        size_t plane_size = dst_width * dst_height;
+        scaled_buffer.resize(plane_size * 3);
+        scale_result = scale_i444(
+            frame.plane_ptr(0), static_cast<int>(frame.width()),
+            frame.plane_ptr(1), static_cast<int>(frame.width()),
+            frame.plane_ptr(2), static_cast<int>(frame.width()),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + plane_size, static_cast<int>(dst_width),
+            scaled_buffer.data() + plane_size * 2, static_cast<int>(dst_width),
+            static_cast<int>(dst_width), static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::NV12: {
+        scaled_buffer.resize(nv12_size);
+        scale_result = scale_nv12(
+            frame.plane_ptr(0), static_cast<int>(frame.width()),
+            frame.plane_ptr(1), static_cast<int>(frame.width()),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width),
+            scaled_buffer.data() + dst_width * dst_height,
+            static_cast<int>(dst_width), static_cast<int>(dst_width),
+            static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::RGBA:
+      case VideoPixelFormat::BGRA: {
+        size_t argb_size = dst_width * dst_height * 4;
+        scaled_buffer.resize(argb_size);
+        scale_result = scale_argb(
+            frame.plane_ptr(0), static_cast<int>(frame.width() * 4),
+            static_cast<int>(frame.width()), static_cast<int>(frame.height()),
+            scaled_buffer.data(), static_cast<int>(dst_width * 4),
+            static_cast<int>(dst_width), static_cast<int>(dst_height));
+        break;
+      }
+      case VideoPixelFormat::RGB:
+      case VideoPixelFormat::BGR: {
+        // RGB/BGR cannot be scaled directly, so convert to NV12 (via I420) before scaling
+        size_t src_i420_size = frame.width() * frame.height() * 3 / 2;
+        std::vector<uint8_t> src_i420(src_i420_size);
+        uint8_t* src_i420_y = src_i420.data();
+        uint8_t* src_i420_u = src_i420_y + frame.width() * frame.height();
+        uint8_t* src_i420_v =
+            src_i420_u + (frame.width() / 2) * (frame.height() / 2);
+
+        if (frame.format() == VideoPixelFormat::RGB) {
+          libyuv::RGB24ToI420(frame.plane_ptr(0),
+                              static_cast<int>(frame.width() * 3), src_i420_y,
+                              static_cast<int>(frame.width()), src_i420_u,
+                              static_cast<int>(frame.width() / 2), src_i420_v,
+                              static_cast<int>(frame.width() / 2),
+                              static_cast<int>(frame.width()),
+                              static_cast<int>(frame.height()));
+        } else {
+          libyuv::RAWToI420(frame.plane_ptr(0),
+                            static_cast<int>(frame.width() * 3), src_i420_y,
+                            static_cast<int>(frame.width()), src_i420_u,
+                            static_cast<int>(frame.width() / 2), src_i420_v,
+                            static_cast<int>(frame.width() / 2),
+                            static_cast<int>(frame.width()),
+                            static_cast<int>(frame.height()));
+        }
+
+        // I420 -> NV12. NOTE(review): the libyuv conversion return values are discarded here and in step 2
+        size_t src_nv12_size = frame.width() * frame.height() * 3 / 2;
+        std::vector<uint8_t> src_nv12(src_nv12_size);
+        uint8_t* src_nv12_y = src_nv12.data();
+        uint8_t* src_nv12_uv = src_nv12_y + frame.width() * frame.height();
+
+        libyuv::I420ToNV12(src_i420_y, static_cast<int>(frame.width()),
+                           src_i420_u, static_cast<int>(frame.width() / 2),
+                           src_i420_v, static_cast<int>(frame.width() / 2),
+                           src_nv12_y, static_cast<int>(frame.width()),
+                           src_nv12_uv, static_cast<int>(frame.width()),
+                           static_cast<int>(frame.width()),
+                           static_cast<int>(frame.height()));
+
+        // Scale in NV12
+        scaled_buffer.resize(nv12_size);
+        scale_result = scale_nv12(
+            src_nv12_y, static_cast<int>(frame.width()), src_nv12_uv,
+            static_cast<int>(frame.width()), static_cast<int>(frame.width()),
+            static_cast<int>(frame.height()), scaled_buffer.data(),
+            static_cast<int>(dst_width),
+            scaled_buffer.data() + dst_width * dst_height,
+            static_cast<int>(dst_width), static_cast<int>(dst_width),
+            static_cast<int>(dst_height));
+
+        // The scaled data is now NV12, so step 2 only has to adopt the buffer
+        current_format = VideoPixelFormat::NV12;
+        break;
+      }
+    }
+
+    if (scale_result != 0) {
+      throw std::runtime_error("libyuv scale failed");
+    }
+
+    current_width = dst_width;
+    current_height = dst_height;
+  }
+
+  // 2. Convert to NV12
+  if (current_format == VideoPixelFormat::NV12) {
+    // Already NV12 after scaling: adopt the scaled buffer without copying
+    result.buffer = std::move(scaled_buffer);
+    result.y = result.buffer.data();
+    result.uv = result.buffer.data() + current_width * current_height;
+  } else {
+    // Any other format still has to be converted into a fresh NV12 buffer
+    result.buffer.resize(nv12_size);
+    uint8_t* dst_y = result.buffer.data();
+    uint8_t* dst_uv = dst_y + current_width * current_height;
+
+    const uint8_t* src_data =
+        needs_scaling ? scaled_buffer.data() : frame.plane_ptr(0);
+
+    switch (current_format) {
+      case VideoPixelFormat::I420: {
+        size_t y_size = current_width * current_height;
+        size_t uv_size = (current_width / 2) * (current_height / 2);
+        const uint8_t* src_u =
+            needs_scaling ? scaled_buffer.data() + y_size : frame.plane_ptr(1);
+        const uint8_t* src_v = needs_scaling
+                                   ? scaled_buffer.data() + y_size + uv_size
+                                   : frame.plane_ptr(2);
+        libyuv::I420ToNV12(src_data, static_cast<int>(current_width), src_u,
+                           static_cast<int>(current_width / 2), src_v,
+                           static_cast<int>(current_width / 2), dst_y,
+                           static_cast<int>(current_width), dst_uv,
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::I422: {
+        size_t y_size = current_width * current_height;
+        size_t uv_size = (current_width / 2) * current_height;
+        const uint8_t* src_u =
+            needs_scaling ? scaled_buffer.data() + y_size : frame.plane_ptr(1);
+        const uint8_t* src_v = needs_scaling
+                                   ? scaled_buffer.data() + y_size + uv_size
+                                   : frame.plane_ptr(2);
+        // I422 -> I420 -> NV12 (two passes through a temporary I420 buffer)
+        size_t i420_size = current_width * current_height * 3 / 2;
+        std::vector<uint8_t> i420_tmp(i420_size);
+        uint8_t* i420_y = i420_tmp.data();
+        uint8_t* i420_u = i420_y + current_width * current_height;
+        uint8_t* i420_v = i420_u + (current_width / 2) * (current_height / 2);
+        libyuv::I422ToI420(src_data, static_cast<int>(current_width), src_u,
+                           static_cast<int>(current_width / 2), src_v,
+                           static_cast<int>(current_width / 2), i420_y,
+                           static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        libyuv::I420ToNV12(i420_y, static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2), dst_y,
+                           static_cast<int>(current_width), dst_uv,
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::I444: {
+        size_t plane_size = current_width * current_height;
+        const uint8_t* src_u = needs_scaling ? scaled_buffer.data() + plane_size
+                                             : frame.plane_ptr(1);
+        const uint8_t* src_v = needs_scaling
+                                   ? scaled_buffer.data() + plane_size * 2
+                                   : frame.plane_ptr(2);
+        // I444 -> I420 -> NV12 (two passes through a temporary I420 buffer)
+        size_t i420_size = current_width * current_height * 3 / 2;
+        std::vector<uint8_t> i420_tmp(i420_size);
+        uint8_t* i420_y = i420_tmp.data();
+        uint8_t* i420_u = i420_y + current_width * current_height;
+        uint8_t* i420_v = i420_u + (current_width / 2) * (current_height / 2);
+        libyuv::I444ToI420(src_data, static_cast<int>(current_width), src_u,
+                           static_cast<int>(current_width), src_v,
+                           static_cast<int>(current_width), i420_y,
+                           static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        libyuv::I420ToNV12(i420_y, static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2), dst_y,
+                           static_cast<int>(current_width), dst_uv,
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::RGBA: {
+        // RGBA -> I420 -> NV12 (libyuv names the R,G,B,A byte order "ABGR")
+        size_t i420_size = current_width * current_height * 3 / 2;
+        std::vector<uint8_t> i420_tmp(i420_size);
+        uint8_t* i420_y = i420_tmp.data();
+        uint8_t* i420_u = i420_y + current_width * current_height;
+        uint8_t* i420_v = i420_u + (current_width / 2) * (current_height / 2);
+        libyuv::ABGRToI420(src_data, static_cast<int>(current_width * 4),
+                           i420_y, static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        libyuv::I420ToNV12(i420_y, static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2), dst_y,
+                           static_cast<int>(current_width), dst_uv,
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::BGRA: {
+        // BGRA -> I420 -> NV12 (libyuv names the B,G,R,A byte order "ARGB")
+        size_t i420_size = current_width * current_height * 3 / 2;
+        std::vector<uint8_t> i420_tmp(i420_size);
+        uint8_t* i420_y = i420_tmp.data();
+        uint8_t* i420_u = i420_y + current_width * current_height;
+        uint8_t* i420_v = i420_u + (current_width / 2) * (current_height / 2);
+        libyuv::ARGBToI420(src_data, static_cast<int>(current_width * 4),
+                           i420_y, static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        libyuv::I420ToNV12(i420_y, static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2), dst_y,
+                           static_cast<int>(current_width), dst_uv,
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::RGB: {
+        // RGB -> I420 -> NV12. NOTE(review): libyuv RGB24 is B,G,R in memory; confirm this mapping is not swapped
+        size_t i420_size = current_width * current_height * 3 / 2;
+        std::vector<uint8_t> i420_tmp(i420_size);
+        uint8_t* i420_y = i420_tmp.data();
+        uint8_t* i420_u = i420_y + current_width * current_height;
+        uint8_t* i420_v = i420_u + (current_width / 2) * (current_height / 2);
+        libyuv::RGB24ToI420(src_data, static_cast<int>(current_width * 3),
+                            i420_y, static_cast<int>(current_width), i420_u,
+                            static_cast<int>(current_width / 2), i420_v,
+                            static_cast<int>(current_width / 2),
+                            static_cast<int>(current_width),
+                            static_cast<int>(current_height));
+        libyuv::I420ToNV12(i420_y, static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2), dst_y,
+                           static_cast<int>(current_width), dst_uv,
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      case VideoPixelFormat::BGR: {
+        // BGR -> I420 -> NV12. NOTE(review): libyuv RAW is R,G,B in memory; confirm this mapping is not swapped
+        size_t i420_size = current_width * current_height * 3 / 2;
+        std::vector<uint8_t> i420_tmp(i420_size);
+        uint8_t* i420_y = i420_tmp.data();
+        uint8_t* i420_u = i420_y + current_width * current_height;
+        uint8_t* i420_v = i420_u + (current_width / 2) * (current_height / 2);
+        libyuv::RAWToI420(src_data, static_cast<int>(current_width * 3), i420_y,
+                          static_cast<int>(current_width), i420_u,
+                          static_cast<int>(current_width / 2), i420_v,
+                          static_cast<int>(current_width / 2),
+                          static_cast<int>(current_width),
+                          static_cast<int>(current_height));
+        libyuv::I420ToNV12(i420_y, static_cast<int>(current_width), i420_u,
+                           static_cast<int>(current_width / 2), i420_v,
+                           static_cast<int>(current_width / 2), dst_y,
+                           static_cast<int>(current_width), dst_uv,
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_width),
+                           static_cast<int>(current_height));
+        break;
+      }
+      default:
+        throw std::runtime_error(
+            "Unsupported pixel format for NV12 conversion");
+    }
+
+    result.y = dst_y;
+    result.uv = dst_uv;
+  }
+
+  return result;
+}
+
+}  // namespace video_scaler
diff --git a/src/bindings/video_scaler.h b/src/bindings/video_scaler.h
new file mode 100644
index 0000000..df09c54
--- /dev/null
+++ b/src/bindings/video_scaler.h
@@ -0,0 +1,65 @@
+// スケーリングヘルパー関数
+// VideoFrame を指定サイズにスケーリングし、I420 または NV12 形式で返す
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "video_frame.h"
+
+namespace video_scaler {
+
+// Result of scaling/converting a frame to I420
+// Used by the AOM and VPX encoders
+struct I420ScaleResult {
+  // Backing storage for the output planes
+  // Empty when the source frame's own planes are returned as-is
+  std::vector<uint8_t> buffer;
+
+  // Output dimensions in pixels
+  uint32_t width;
+  uint32_t height;
+
+  // Pointers to the Y, U and V planes (into buffer when it is non-empty)
+  // When buffer is empty they point into the source frame, which presumably must outlive this result
+  const uint8_t* y;
+  const uint8_t* u;
+  const uint8_t* v;
+
+  // Row stride of each plane
+  int stride_y;
+  int stride_u;
+  int stride_v;
+};
+
+// Result of scaling/converting a frame to NV12
+// Used by the NVENC and Intel VPL encoders
+struct NV12ScaleResult {
+  // Backing storage for the output planes
+  // Empty when the source frame's own planes are returned as-is
+  std::vector<uint8_t> buffer;
+
+  // Output dimensions in pixels
+  uint32_t width;
+  uint32_t height;
+
+  // Pointers to the Y plane and the interleaved UV plane (into buffer when it is non-empty)
+  // When buffer is empty they point into the source frame, which presumably must outlive this result
+  const uint8_t* y;
+  const uint8_t* uv;
+};
+
+// Scale and/or convert a frame to I420; throws std::runtime_error on scale failure or unsupported format
+// Returns the source frame's own plane pointers when no scaling is needed and the input is already I420
+I420ScaleResult scale_to_i420(const VideoFrame& frame,
+                              uint32_t dst_width,
+                              uint32_t dst_height);
+
+// Scale and/or convert a frame to NV12; throws std::runtime_error on scale failure or unsupported format
+// Returns the source frame's own plane pointers when no scaling is needed and the input is already NV12
+NV12ScaleResult scale_to_nv12(const VideoFrame& frame,
+                              uint32_t dst_width,
+                              uint32_t dst_height);
+
+}  // namespace video_scaler
diff --git a/tests/test_apple_video_toolbox.py b/tests/test_apple_video_toolbox.py
index a889402..d090ca4 100644
--- a/tests/test_apple_video_toolbox.py
+++ b/tests/test_apple_video_toolbox.py
@@ -1890,115 +1890,73 @@ def on_decode_error(error):
 
 # =============================================================================
 # スケーリングテスト (VTPixelTransferSession)
+#
+# WebCodecs API 仕様: 「The encoder MUST scale any VideoFrame whose
+# visible width differs from the configured width value」
+#
+# Apple Video Toolbox がサポートするピクセルフォーマット: I420, NV12, BGRA
 # =============================================================================
 
-
-def test_h264_encode_with_scaling():
-    """configure と異なる解像度のフレームを H.264 でエンコードする (スケーリング).
-
-    WebCodecs API 仕様: 「The encoder MUST scale any VideoFrame whose
-    visible width differs from the configured width value」
-    """
-    encoded_chunks = []
-
-    def on_output(chunk):
-        encoded_chunks.append(chunk)
-
-    def on_error(error):
-        pytest.fail(f"Encoder error: {error}")
-
-    encoder = VideoEncoder(on_output, on_error)
-
-    # configure: 640x360 (出力解像度)
-    config: VideoEncoderConfig = {
-        "codec": "avc1.42E01E",
-        "width": 640,
-        "height": 360,
-        "bitrate": 1_000_000,
-        "framerate": 30,
-        "latency_mode": LatencyMode.REALTIME,
-        "hardware_acceleration_engine": HardwareAccelerationEngine.APPLE_VIDEO_TOOLBOX,
-        "avc": {"format": "annexb"},
-    }
-
-    encoder.configure(config)
-
-    # encode: 1280x720 のフレーム (入力解像度)
-    input_width, input_height = 1280, 720
-    data_size = input_width * input_height * 3 // 2  # I420
-    test_frames = []
-
-    for i in range(5):
-        data = np.full(data_size, (i * 50) % 256, dtype=np.uint8)
-        init: VideoFrameBufferInit = {
-            "format": VideoPixelFormat.I420,
-            "coded_width": input_width,
-            "coded_height": input_height,
-            "timestamp": i * 33333,
-        }
-        frame = VideoFrame(data, init)
-        test_frames.append(frame)
-        encoder.encode(frame, {"key_frame": i == 0})
-
-    encoder.flush()
-
-    # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= 5, (
-        f"5 フレームがエンコードされるべき、実際: {len(encoded_chunks)}"
-    )
-
-    # 最初のチャンクがキーフレームであることを確認
-    assert encoded_chunks[0].type == EncodedVideoChunkType.KEY
-
-    # デコードして出力解像度を確認
-    decoded_frames = []
-
-    def on_decode_output(frame):
-        decoded_frames.append(frame)
-
-    def on_decode_error(error):
-        pytest.fail(f"Decoder error: {error}")
-
-    decoder = VideoDecoder(on_decode_output, on_decode_error)
-
-    decoder_config: VideoDecoderConfig = {
-        "codec": "avc1.42E01E",
-        "coded_width": 640,
-        "coded_height": 360,
+# Pixel formats exercised by the scaling tests (only those Apple Video Toolbox supports)
+SCALING_PIXEL_FORMATS = [
+    VideoPixelFormat.I420,
+    VideoPixelFormat.NV12,
+    VideoPixelFormat.BGRA,
+]
+
+# Pixel formats for the same-resolution tests (BGRA is only supported when scaling)
+SAME_RESOLUTION_PIXEL_FORMATS = [
+    VideoPixelFormat.I420,
+    VideoPixelFormat.NV12,
+]
+
+
+def _calculate_scaling_frame_data_size(
+    width: int, height: int, pixel_format: VideoPixelFormat
+) -> int:
+    """Return the buffer size in bytes of a scaling-test frame; raises ValueError for other formats."""
+    match pixel_format:
+        case VideoPixelFormat.I420 | VideoPixelFormat.NV12:
+            return width * height * 3 // 2
+        case VideoPixelFormat.BGRA:
+            return width * height * 4
+        case _:
+            raise ValueError(f"Unsupported pixel format: {pixel_format}")
+
+
+def _make_scaling_test_frame(
+    width: int,
+    height: int,
+    frame_num: int = 0,
+    pixel_format: VideoPixelFormat = VideoPixelFormat.I420,
+) -> VideoFrame:
+    """Create a zero-filled VideoFrame of the given size and format for the scaling tests."""
+    data_size = _calculate_scaling_frame_data_size(width, height, pixel_format)
+    data = np.zeros(data_size, dtype=np.uint8)
+    init: VideoFrameBufferInit = {
+        "format": pixel_format,
+        "coded_width": width,
+        "coded_height": height,
+        "timestamp": frame_num * 33333,
     }
-    decoder.configure(decoder_config)
-
-    for chunk in encoded_chunks:
-        decoder.decode(chunk)
-
-    decoder.flush()
-
-    # デコードされたフレームが出力解像度になっていることを確認
-    assert len(decoded_frames) >= 1
-    for frame in decoded_frames:
-        assert frame.coded_width == 640, (
-            f"出力幅が期待値と異なる: 期待値 640, 実際 {frame.coded_width}"
-        )
-        assert frame.coded_height == 360, (
-            f"出力高さが期待値と異なる: 期待値 360, 実際 {frame.coded_height}"
-        )
-
-    print(
-        f"スケーリングテスト成功: 入力 {input_width}x{input_height} -> "
-        f"出力 640x360, エンコードチャンク数: {len(encoded_chunks)}"
-    )
+    return VideoFrame(data, init)
 
-    # クリーンアップ
-    for frame in test_frames:
-        frame.close()
-    for frame in decoded_frames:
-        frame.close()
-    encoder.close()
-    decoder.close()
 
+@pytest.mark.parametrize(
+    "codec",
+    [
+        pytest.param("avc1.42E01E", id="H264"),
+        pytest.param("hvc1.1.6.L93.B0", id="HEVC"),
+    ],
+)
+@pytest.mark.parametrize("pixel_format", SCALING_PIXEL_FORMATS)
+def test_encode_with_scaling(codec: str, pixel_format: VideoPixelFormat):
+    """エンコーダのスケーリング機能テスト (各コーデック・各ピクセルフォーマット)."""
+    # configure: 320x240 (出力解像度)
+    output_width, output_height = 320, 240
+    # encode: 640x480 のフレーム (入力解像度)
+    input_width, input_height = 640, 480
 
-def test_hevc_encode_with_scaling():
-    """configure と異なる解像度のフレームを HEVC でエンコードする (スケーリング)."""
     encoded_chunks = []
 
     def on_output(chunk):
@@ -2009,43 +1967,32 @@ def on_error(error):
 
     encoder = VideoEncoder(on_output, on_error)
 
-    # configure: 640x480 (出力解像度)
+    # コーデックごとのフォーマット設定
     config: VideoEncoderConfig = {
-        "codec": "hvc1.1.6.L93.B0",
-        "width": 640,
-        "height": 480,
-        "bitrate": 1_000_000,
+        "codec": codec,
+        "width": output_width,
+        "height": output_height,
+        "bitrate": 500_000,
         "framerate": 30,
         "latency_mode": LatencyMode.REALTIME,
         "hardware_acceleration_engine": HardwareAccelerationEngine.APPLE_VIDEO_TOOLBOX,
-        "hevc": {"format": "annexb"},
     }
-
+    if codec.startswith("avc"):
+        config["avc"] = {"format": "annexb"}
+    elif codec.startswith("hvc"):
+        config["hevc"] = {"format": "annexb"}
     encoder.configure(config)
 
-    # encode: 1920x1080 のフレーム (入力解像度)
-    input_width, input_height = 1920, 1080
-    data_size = input_width * input_height * 3 // 2  # I420
-    test_frames = []
-
-    for i in range(3):
-        data = np.full(data_size, (i * 80) % 256, dtype=np.uint8)
-        init: VideoFrameBufferInit = {
-            "format": VideoPixelFormat.I420,
-            "coded_width": input_width,
-            "coded_height": input_height,
-            "timestamp": i * 33333,
-        }
-        frame = VideoFrame(data, init)
-        test_frames.append(frame)
-        encoder.encode(frame, {"key_frame": i == 0})
-
+    # 入力解像度のフレームを作成
+    frame = _make_scaling_test_frame(input_width, input_height, 0, pixel_format)
+    encoder.encode(frame, {"key_frame": True})
     encoder.flush()
+    frame.close()
 
     # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= 3, (
-        f"3 フレームがエンコードされるべき、実際: {len(encoded_chunks)}"
-    )
+    assert len(encoded_chunks) >= 1
+    assert encoded_chunks[0].byte_length > 0
+    assert encoded_chunks[0].type == EncodedVideoChunkType.KEY
 
     # デコードして出力解像度を確認
     decoded_frames = []
@@ -2059,94 +2006,39 @@ def on_decode_error(error):
     decoder = VideoDecoder(on_decode_output, on_decode_error)
 
     decoder_config: VideoDecoderConfig = {
-        "codec": "hvc1.1.6.L93.B0",
-        "coded_width": 640,
-        "coded_height": 480,
+        "codec": codec,
+        "coded_width": output_width,
+        "coded_height": output_height,
     }
     decoder.configure(decoder_config)
 
     for chunk in encoded_chunks:
         decoder.decode(chunk)
-
     decoder.flush()
 
     # デコードされたフレームが出力解像度になっていることを確認
     assert len(decoded_frames) >= 1
     for frame in decoded_frames:
-        assert frame.coded_width == 640, (
-            f"出力幅が期待値と異なる: 期待値 640, 実際 {frame.coded_width}"
-        )
-        assert frame.coded_height == 480, (
-            f"出力高さが期待値と異なる: 期待値 480, 実際 {frame.coded_height}"
-        )
-
-    print(
-        f"HEVC スケーリングテスト成功: 入力 {input_width}x{input_height} -> "
-        f"出力 640x480, エンコードチャンク数: {len(encoded_chunks)}"
-    )
-
-    # クリーンアップ
-    for frame in test_frames:
-        frame.close()
-    for frame in decoded_frames:
+        assert frame.coded_width == output_width
+        assert frame.coded_height == output_height
         frame.close()
+
     encoder.close()
     decoder.close()
 
 
-def test_scaling_with_nv12_input():
-    """NV12 形式の入力フレームでスケーリングが動作することを確認."""
-    encoded_chunks = []
-
-    def on_output(chunk):
-        encoded_chunks.append(chunk)
-
-    def on_error(error):
-        pytest.fail(f"Encoder error: {error}")
-
-    encoder = VideoEncoder(on_output, on_error)
-
-    # configure: 320x240 (出力解像度)
-    config: VideoEncoderConfig = {
-        "codec": "avc1.42E01E",
-        "width": 320,
-        "height": 240,
-        "bitrate": 500_000,
-        "framerate": 30,
-        "latency_mode": LatencyMode.REALTIME,
-        "hardware_acceleration_engine": HardwareAccelerationEngine.APPLE_VIDEO_TOOLBOX,
-    }
-
-    encoder.configure(config)
-
-    # encode: 640x480 の NV12 フレーム (入力解像度)
-    input_width, input_height = 640, 480
-    data_size = input_width * input_height * 3 // 2  # NV12
-
-    data = np.zeros(data_size, dtype=np.uint8)
-    init: VideoFrameBufferInit = {
-        "format": VideoPixelFormat.NV12,
-        "coded_width": input_width,
-        "coded_height": input_height,
-        "timestamp": 0,
-    }
-    frame = VideoFrame(data, init)
-    encoder.encode(frame, {"key_frame": True})
-
-    encoder.flush()
-
-    # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= 1, "NV12 スケーリングエンコードに失敗"
-
-    print(f"NV12 スケーリングテスト成功: 入力 {input_width}x{input_height} -> 出力 320x240")
-
-    # クリーンアップ
-    frame.close()
-    encoder.close()
-
+@pytest.mark.parametrize(
+    "codec",
+    [
+        pytest.param("avc1.42E01E", id="H264"),
+        pytest.param("hvc1.1.6.L93.B0", id="HEVC"),
+    ],
+)
+@pytest.mark.parametrize("pixel_format", SAME_RESOLUTION_PIXEL_FORMATS)
+def test_encode_scaling_same_resolution(codec: str, pixel_format: VideoPixelFormat):
+    """configure と同じ解像度のフレームはスケーリングなしでエンコード (各コーデック・各ピクセルフォーマット)."""
+    width, height = 320, 240
 
-def test_scaling_same_resolution():
-    """configure と同じ解像度のフレームはスケーリングなしでエンコードされることを確認."""
     encoded_chunks = []
 
     def on_output(chunk):
@@ -2157,40 +2049,25 @@ def on_error(error):
 
     encoder = VideoEncoder(on_output, on_error)
 
-    # configure と encode で同じ解像度
     config: VideoEncoderConfig = {
-        "codec": "avc1.42E01E",
-        "width": 640,
-        "height": 480,
-        "bitrate": 1_000_000,
+        "codec": codec,
+        "width": width,
+        "height": height,
+        "bitrate": 500_000,
         "framerate": 30,
         "latency_mode": LatencyMode.REALTIME,
         "hardware_acceleration_engine": HardwareAccelerationEngine.APPLE_VIDEO_TOOLBOX,
     }
-
     encoder.configure(config)
 
     # 同じ解像度のフレーム
-    width, height = 640, 480
-    data_size = width * height * 3 // 2  # I420
-
-    data = np.zeros(data_size, dtype=np.uint8)
-    init: VideoFrameBufferInit = {
-        "format": VideoPixelFormat.I420,
-        "coded_width": width,
-        "coded_height": height,
-        "timestamp": 0,
-    }
-    frame = VideoFrame(data, init)
+    frame = _make_scaling_test_frame(width, height, 0, pixel_format)
     encoder.encode(frame, {"key_frame": True})
-
     encoder.flush()
+    frame.close()
 
     # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= 1, "同一解像度エンコードに失敗"
+    assert len(encoded_chunks) >= 1
+    assert encoded_chunks[0].byte_length > 0
 
-    print("同一解像度テスト成功: スケーリングなしでエンコード")
-
-    # クリーンアップ
-    frame.close()
     encoder.close()
diff --git a/tests/test_encoder_scaling.py b/tests/test_encoder_scaling.py
index 3234861..aed825b 100644
--- a/tests/test_encoder_scaling.py
+++ b/tests/test_encoder_scaling.py
@@ -6,6 +6,12 @@
 - ソフトウェアエンコーダー (AV1, VP8, VP9): libyuv を使用
 - ハードウェアエンコーダー (NVENC, Intel VPL): libyuv を使用
 - Apple Video Toolbox: VTPixelTransferSession を使用 (test_apple_video_toolbox.py)
+
+テストデータについて:
+    このテストでは全てのピクセルフォーマット (I420, I422, I444, NV12, RGBA, BGRA, RGB, BGR)
+    に対してスケーリング機能をテストする。テストフレームのデータは全てゼロ (黒) だが、
+    各フォーマットに応じた正しいサイズで生成される。VideoFrame はサイズと format 指定に
+    基づいてデータを解釈するため、スケーリング機能のテストとしてはサイズが正しければ十分。
 """
 
 import platform
@@ -14,7 +20,6 @@
 import pytest
 
 from webcodecs import (
-    CodecState,
     EncodedVideoChunkType,
     LatencyMode,
     VideoDecoder,
@@ -27,12 +32,32 @@
 )
 
 
-def _make_test_frame(width: int, height: int, frame_num: int = 0) -> VideoFrame:
+def _calculate_frame_data_size(width: int, height: int, pixel_format: VideoPixelFormat) -> int:
+    """Return the uint8 buffer length for a width x height frame in pixel_format; ValueError if unlisted."""
+    match pixel_format:
+        case VideoPixelFormat.I420 | VideoPixelFormat.NV12:
+            return width * height * 3 // 2  # 1.5 bytes per pixel, floored by the integer division
+        case VideoPixelFormat.I422:
+            return width * height * 2  # 2 bytes per pixel
+        case VideoPixelFormat.I444 | VideoPixelFormat.RGB | VideoPixelFormat.BGR:
+            return width * height * 3  # 3 bytes per pixel
+        case VideoPixelFormat.RGBA | VideoPixelFormat.BGRA:
+            return width * height * 4  # 4 bytes per pixel
+        case _:  # any format not listed above is rejected rather than guessed
+            raise ValueError(f"Unsupported pixel format: {pixel_format}")
+
+
+def _make_test_frame(
+    width: int,
+    height: int,
+    frame_num: int = 0,
+    pixel_format: VideoPixelFormat = VideoPixelFormat.I420,
+) -> VideoFrame:
     """テスト用の VideoFrame を作成する."""
-    data_size = width * height * 3 // 2  # I420
+    data_size = _calculate_frame_data_size(width, height, pixel_format)
     data = np.zeros(data_size, dtype=np.uint8)
     init: VideoFrameBufferInit = {
-        "format": VideoPixelFormat.I420,
+        "format": pixel_format,
         "coded_width": width,
         "coded_height": height,
         "timestamp": frame_num * 1000,
@@ -42,12 +67,34 @@ def _make_test_frame(width: int, height: int, frame_num: int = 0) -> VideoFrame:
 
 
 # =============================================================================
-# AV1 スケーリングテスト
+# スケーリングテスト (全コーデック共通)
 # =============================================================================
 
-
-def test_av1_encode_with_scaling():
-    """AV1 エンコーダのスケーリング機能テスト."""
+CODECS = [  # codec strings for the parametrized tests; VP8/VP9 are skipped unless platform.system() is Darwin or Linux
+    pytest.param("av01.0.04M.08", id="AV1"),
+    pytest.param(
+        "vp8",
+        marks=pytest.mark.skipif(
+            platform.system() not in ("Darwin", "Linux"),
+            reason="VP8 は macOS / Linux のみサポート",
+        ),
+        id="VP8",
+    ),
+    pytest.param(
+        "vp09.00.10.08",
+        marks=pytest.mark.skipif(
+            platform.system() not in ("Darwin", "Linux"),
+            reason="VP9 は macOS / Linux のみサポート",
+        ),
+        id="VP9",
+    ),
+]
+
+
+@pytest.mark.parametrize("codec", CODECS)
+@pytest.mark.parametrize("pixel_format", VideoPixelFormat)
+def test_encode_with_scaling(codec: str, pixel_format: VideoPixelFormat):
+    """エンコーダのスケーリング機能テスト (各コーデック・各ピクセルフォーマット)."""
     # configure: 320x240 (出力解像度)
     output_width, output_height = 320, 240
     # encode: 640x480 のフレーム (入力解像度)
@@ -64,7 +111,7 @@ def on_error(error):
     encoder = VideoEncoder(on_output, on_error)
 
     config: VideoEncoderConfig = {
-        "codec": "av01.0.04M.08",
+        "codec": codec,
         "width": output_width,
         "height": output_height,
         "bitrate": 500_000,
@@ -74,7 +121,7 @@ def on_error(error):
     encoder.configure(config)
 
     # 入力解像度のフレームを作成
-    frame = _make_test_frame(input_width, input_height, 0)
+    frame = _make_test_frame(input_width, input_height, 0, pixel_format)
     encoder.encode(frame, {"key_frame": True})
     encoder.flush()
     frame.close()
@@ -95,7 +142,7 @@ def on_decode_error(error):
 
     decoder = VideoDecoder(on_decode_output, on_decode_error)
 
-    decoder_config: VideoDecoderConfig = {"codec": "av01.0.04M.08"}
+    decoder_config: VideoDecoderConfig = {"codec": codec}
     decoder.configure(decoder_config)
 
     for chunk in encoded_chunks:
@@ -105,20 +152,18 @@ def on_decode_error(error):
     # デコードされたフレームが出力解像度になっていることを確認
     assert len(decoded_frames) >= 1
     for frame in decoded_frames:
-        assert frame.coded_width == output_width, (
-            f"出力幅が期待値と異なる: 期待値 {output_width}, 実際 {frame.coded_width}"
-        )
-        assert frame.coded_height == output_height, (
-            f"出力高さが期待値と異なる: 期待値 {output_height}, 実際 {frame.coded_height}"
-        )
+        assert frame.coded_width == output_width
+        assert frame.coded_height == output_height
         frame.close()
 
     encoder.close()
     decoder.close()
 
 
-def test_av1_encode_scaling_same_resolution():
-    """AV1 configure と同じ解像度のフレームはスケーリングなしでエンコードされることを確認."""
+@pytest.mark.parametrize("codec", CODECS)
+@pytest.mark.parametrize("pixel_format", VideoPixelFormat)
+def test_encode_scaling_same_resolution(codec: str, pixel_format: VideoPixelFormat):
+    """configure と同じ解像度のフレームはスケーリングなしでエンコード (各コーデック・各ピクセルフォーマット)."""
     width, height = 320, 240
 
     encoded_chunks = []
@@ -132,7 +177,7 @@ def on_error(error):
     encoder = VideoEncoder(on_output, on_error)
 
     config: VideoEncoderConfig = {
-        "codec": "av01.0.04M.08",
+        "codec": codec,
         "width": width,
         "height": height,
         "bitrate": 500_000,
@@ -142,98 +187,7 @@ def on_error(error):
     encoder.configure(config)
 
     # 同じ解像度のフレーム
-    frame = _make_test_frame(width, height, 0)
-    encoder.encode(frame, {"key_frame": True})
-    encoder.flush()
-    frame.close()
-
-    # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= 1
-    assert encoded_chunks[0].byte_length > 0
-
-    encoder.close()
-
-
-def test_av1_encode_scaling_multiple_frames():
-    """AV1 複数フレームでのスケーリングテスト."""
-    # configure: 320x240 (出力解像度)
-    output_width, output_height = 320, 240
-    # encode: 640x480 のフレーム (入力解像度)
-    input_width, input_height = 640, 480
-    num_frames = 3
-
-    encoded_chunks = []
-
-    def on_output(chunk):
-        encoded_chunks.append(chunk)
-
-    def on_error(error):
-        pytest.fail(f"Encoder error: {error}")
-
-    encoder = VideoEncoder(on_output, on_error)
-
-    config: VideoEncoderConfig = {
-        "codec": "av01.0.04M.08",
-        "width": output_width,
-        "height": output_height,
-        "bitrate": 500_000,
-        "framerate": 30.0,
-        "latency_mode": LatencyMode.REALTIME,
-    }
-    encoder.configure(config)
-
-    # 入力解像度のフレームを複数作成・エンコード
-    for i in range(num_frames):
-        frame = _make_test_frame(input_width, input_height, i)
-        encoder.encode(frame, {"key_frame": i == 0})
-        frame.close()
-
-    encoder.flush()
-
-    # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= num_frames
-
-    encoder.close()
-
-
-# =============================================================================
-# VP8 スケーリングテスト
-# =============================================================================
-
-
-@pytest.mark.skipif(
-    platform.system() not in ("Darwin", "Linux"),
-    reason="VP8 は macOS / Linux のみサポート",
-)
-def test_vp8_encode_with_scaling():
-    """VP8 エンコーダのスケーリング機能テスト."""
-    # configure: 320x240 (出力解像度)
-    output_width, output_height = 320, 240
-    # encode: 640x480 のフレーム (入力解像度)
-    input_width, input_height = 640, 480
-
-    encoded_chunks = []
-
-    def on_output(chunk):
-        encoded_chunks.append(chunk)
-
-    def on_error(error):
-        pytest.fail(f"Encoder error: {error}")
-
-    encoder = VideoEncoder(on_output, on_error)
-
-    config: VideoEncoderConfig = {
-        "codec": "vp8",
-        "width": output_width,
-        "height": output_height,
-        "bitrate": 500_000,
-        "framerate": 30.0,
-        "latency_mode": LatencyMode.REALTIME,
-    }
-    encoder.configure(config)
-
-    # 入力解像度のフレームを作成
-    frame = _make_test_frame(input_width, input_height, 0)
+    frame = _make_test_frame(width, height, 0, pixel_format)
     encoder.encode(frame, {"key_frame": True})
     encoder.flush()
     frame.close()
@@ -241,209 +195,14 @@ def on_error(error):
     # エンコードが成功していることを確認
     assert len(encoded_chunks) >= 1
     assert encoded_chunks[0].byte_length > 0
-    assert encoded_chunks[0].type == EncodedVideoChunkType.KEY
-
-    # デコードして出力解像度を確認
-    decoded_frames = []
-
-    def on_decode_output(frame):
-        decoded_frames.append(frame)
-
-    def on_decode_error(error):
-        pytest.fail(f"Decoder error: {error}")
-
-    decoder = VideoDecoder(on_decode_output, on_decode_error)
-
-    decoder_config: VideoDecoderConfig = {"codec": "vp8"}
-    decoder.configure(decoder_config)
-
-    for chunk in encoded_chunks:
-        decoder.decode(chunk)
-    decoder.flush()
-
-    # デコードされたフレームが出力解像度になっていることを確認
-    assert len(decoded_frames) >= 1
-    for frame in decoded_frames:
-        assert frame.coded_width == output_width, (
-            f"出力幅が期待値と異なる: 期待値 {output_width}, 実際 {frame.coded_width}"
-        )
-        assert frame.coded_height == output_height, (
-            f"出力高さが期待値と異なる: 期待値 {output_height}, 実際 {frame.coded_height}"
-        )
-        frame.close()
 
     encoder.close()
-    decoder.close()
 
 
-@pytest.mark.skipif(
-    platform.system() not in ("Darwin", "Linux"),
-    reason="VP8 は macOS / Linux のみサポート",
-)
-def test_vp8_encode_scaling_same_resolution():
-    """VP8 configure と同じ解像度のフレームはスケーリングなしでエンコードされることを確認."""
-    width, height = 320, 240
-
-    encoded_chunks = []
-
-    def on_output(chunk):
-        encoded_chunks.append(chunk)
-
-    def on_error(error):
-        pytest.fail(f"Encoder error: {error}")
-
-    encoder = VideoEncoder(on_output, on_error)
-
-    config: VideoEncoderConfig = {
-        "codec": "vp8",
-        "width": width,
-        "height": height,
-        "bitrate": 500_000,
-        "framerate": 30.0,
-        "latency_mode": LatencyMode.REALTIME,
-    }
-    encoder.configure(config)
-
-    # 同じ解像度のフレーム
-    frame = _make_test_frame(width, height, 0)
-    encoder.encode(frame, {"key_frame": True})
-    encoder.flush()
-    frame.close()
-
-    # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= 1
-    assert encoded_chunks[0].byte_length > 0
-
-    encoder.close()
-
-
-# =============================================================================
-# VP9 スケーリングテスト
-# =============================================================================
-
-
-@pytest.mark.skipif(
-    platform.system() not in ("Darwin", "Linux"),
-    reason="VP9 は macOS / Linux のみサポート",
-)
-def test_vp9_encode_with_scaling():
-    """VP9 エンコーダのスケーリング機能テスト."""
-    # configure: 320x240 (出力解像度)
-    output_width, output_height = 320, 240
-    # encode: 640x480 のフレーム (入力解像度)
-    input_width, input_height = 640, 480
-
-    encoded_chunks = []
-
-    def on_output(chunk):
-        encoded_chunks.append(chunk)
-
-    def on_error(error):
-        pytest.fail(f"Encoder error: {error}")
-
-    encoder = VideoEncoder(on_output, on_error)
-
-    config: VideoEncoderConfig = {
-        "codec": "vp09.00.10.08",
-        "width": output_width,
-        "height": output_height,
-        "bitrate": 500_000,
-        "framerate": 30.0,
-        "latency_mode": LatencyMode.REALTIME,
-    }
-    encoder.configure(config)
-
-    # 入力解像度のフレームを作成
-    frame = _make_test_frame(input_width, input_height, 0)
-    encoder.encode(frame, {"key_frame": True})
-    encoder.flush()
-    frame.close()
-
-    # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= 1
-    assert encoded_chunks[0].byte_length > 0
-    assert encoded_chunks[0].type == EncodedVideoChunkType.KEY
-
-    # デコードして出力解像度を確認
-    decoded_frames = []
-
-    def on_decode_output(frame):
-        decoded_frames.append(frame)
-
-    def on_decode_error(error):
-        pytest.fail(f"Decoder error: {error}")
-
-    decoder = VideoDecoder(on_decode_output, on_decode_error)
-
-    decoder_config: VideoDecoderConfig = {"codec": "vp09.00.10.08"}
-    decoder.configure(decoder_config)
-
-    for chunk in encoded_chunks:
-        decoder.decode(chunk)
-    decoder.flush()
-
-    # デコードされたフレームが出力解像度になっていることを確認
-    assert len(decoded_frames) >= 1
-    for frame in decoded_frames:
-        assert frame.coded_width == output_width, (
-            f"出力幅が期待値と異なる: 期待値 {output_width}, 実際 {frame.coded_width}"
-        )
-        assert frame.coded_height == output_height, (
-            f"出力高さが期待値と異なる: 期待値 {output_height}, 実際 {frame.coded_height}"
-        )
-        frame.close()
-
-    encoder.close()
-    decoder.close()
-
-
-@pytest.mark.skipif(
-    platform.system() not in ("Darwin", "Linux"),
-    reason="VP9 は macOS / Linux のみサポート",
-)
-def test_vp9_encode_scaling_same_resolution():
-    """VP9 configure と同じ解像度のフレームはスケーリングなしでエンコードされることを確認."""
-    width, height = 320, 240
-
-    encoded_chunks = []
-
-    def on_output(chunk):
-        encoded_chunks.append(chunk)
-
-    def on_error(error):
-        pytest.fail(f"Encoder error: {error}")
-
-    encoder = VideoEncoder(on_output, on_error)
-
-    config: VideoEncoderConfig = {
-        "codec": "vp09.00.10.08",
-        "width": width,
-        "height": height,
-        "bitrate": 500_000,
-        "framerate": 30.0,
-        "latency_mode": LatencyMode.REALTIME,
-    }
-    encoder.configure(config)
-
-    # 同じ解像度のフレーム
-    frame = _make_test_frame(width, height, 0)
-    encoder.encode(frame, {"key_frame": True})
-    encoder.flush()
-    frame.close()
-
-    # エンコードが成功していることを確認
-    assert len(encoded_chunks) >= 1
-    assert encoded_chunks[0].byte_length > 0
-
-    encoder.close()
-
-
-@pytest.mark.skipif(
-    platform.system() not in ("Darwin", "Linux"),
-    reason="VP9 は macOS / Linux のみサポート",
-)
-def test_vp9_encode_scaling_multiple_frames():
-    """VP9 複数フレームでのスケーリングテスト."""
+@pytest.mark.parametrize("codec", CODECS)
+@pytest.mark.parametrize("pixel_format", VideoPixelFormat)
+def test_encode_scaling_multiple_frames(codec: str, pixel_format: VideoPixelFormat):
+    """複数フレームでのスケーリングテスト (各コーデック・各ピクセルフォーマット)."""
     # configure: 320x240 (出力解像度)
     output_width, output_height = 320, 240
     # encode: 640x480 のフレーム (入力解像度)
@@ -461,7 +220,7 @@ def on_error(error):
     encoder = VideoEncoder(on_output, on_error)
 
     config: VideoEncoderConfig = {
-        "codec": "vp09.00.10.08",
+        "codec": codec,
         "width": output_width,
         "height": output_height,
         "bitrate": 500_000,
@@ -472,7 +231,7 @@ def on_error(error):
 
     # 入力解像度のフレームを複数作成・エンコード
     for i in range(num_frames):
-        frame = _make_test_frame(input_width, input_height, i)
+        frame = _make_test_frame(input_width, input_height, i, pixel_format)
         encoder.encode(frame, {"key_frame": i == 0})
         frame.close()