diff --git a/kernel/src/drivers/usb/xhci.rs b/kernel/src/drivers/usb/xhci.rs
index ebb33941..c1cf7665 100644
--- a/kernel/src/drivers/usb/xhci.rs
+++ b/kernel/src/drivers/usb/xhci.rs
@@ -466,6 +466,9 @@ pub static DIAG_KBD_PORTSC: AtomicU32 = AtomicU32::new(0);
 pub static DIAG_KBD_EP_STATE: AtomicU32 = AtomicU32::new(0);
 /// Periodic diagnostic: SPI enable count (how many times SPI was re-enabled).
 pub static DIAG_SPI_ENABLE_COUNT: AtomicU64 = AtomicU64::new(0);
+/// Set once after the first deferred SPI activation; handle_interrupt
+/// keeps SPI alive after that, so poll_hid_events doesn't re-enable.
+static SPI_ACTIVATED: AtomicBool = AtomicBool::new(false);
 /// Diagnostic counter for doorbell/transfer events (shown as `db=` in heartbeat).
 pub static DIAG_DOORBELL_EP_STATE: AtomicU32 = AtomicU32::new(0);
 /// Diagnostic: last CC received for any GET_REPORT Transfer Event (0xFF = none seen yet).
@@ -4777,9 +4780,9 @@ pub fn init(pci_dev: &crate::drivers::pci::Device) -> Result<(), &'static str> {
 /// Handle an XHCI interrupt.
 ///
 /// Called from the GIC interrupt handler when the XHCI IRQ fires.
-/// Immediately disables the GIC SPI to prevent re-delivery storms,
-/// then processes all pending events. The SPI is re-enabled by
-/// poll_hid_events() on the next timer tick (~5ms later).
+/// Disables the GIC SPI while processing to prevent re-delivery during
+/// IMAN/ERDP acknowledgment, then re-enables it before returning so the
+/// next event gets a real interrupt with no polling delay.
 pub fn handle_interrupt() {
     if !XHCI_INITIALIZED.load(Ordering::Acquire) {
         // SPI should not be enabled during init (it's deferred until
@@ -5012,6 +5015,17 @@ pub fn handle_interrupt() {
         }
     }
 
+    // Re-enable the GIC SPI now that we've drained the event ring.
+    // Any MSI generated by the IMAN/ERDP writes above will fire as a
+    // new interrupt after we return.  That second invocation will find
+    // an empty ring (IP=0, no cycle-bit match) and return quickly —
+    // no storm, because we only write IMAN/USBSTS when their bits are
+    // actually set.
+    if state.irq != 0 {
+        crate::arch_impl::aarch64::gic::clear_spi_pending(state.irq);
+        crate::arch_impl::aarch64::gic::enable_spi(state.irq);
+    }
+
 }
 
 // =============================================================================
@@ -5186,10 +5200,14 @@ fn deferred_queue_trbs(state: &XhciState) {
     }
 }
 
+/// Timer-tick housekeeping for xHCI.
 ///
-/// Called from the timer interrupt at ~200 Hz. Uses `try_lock()` to avoid
-/// deadlocking if the lock is held by non-interrupt code. Bypasses the
-/// IMAN.IP check since that may not be set without a wired interrupt line.
+/// Called from the timer interrupt at 200 Hz (every 5ms). Handles:
+/// - One-time deferred SPI activation (first 250ms after init)
+/// - Endpoint reset recovery for CC=12 errors
+/// - Doorbell re-ring after SPI activation
+/// - Draining any events the MSI handler missed (safety net only;
+///   the primary event path is handle_interrupt)
 pub fn poll_hid_events() {
     if !XHCI_INITIALIZED.load(Ordering::Acquire) {
         return;
@@ -5197,13 +5215,6 @@ pub fn poll_hid_events() {
 
     POLL_COUNT.fetch_add(1, Ordering::Relaxed);
 
-    // Rate-limit: poll every 4th tick (~50 Hz at 200 Hz timer).
-    // Balances responsiveness (20ms latency) with hypervisor overhead.
-    let poll = POLL_COUNT.load(Ordering::Relaxed);
-    if poll % 4 != 0 {
-        return;
-    }
-
     // try_lock: if someone else holds the lock, skip this poll cycle
     let _guard = match XHCI_LOCK.try_lock() {
         Some(g) => g,
@@ -5558,29 +5569,26 @@ pub fn poll_hid_events() {
     }
 
     // Deferred MSI activation.
-    // SPI is enabled after a stabilization period (200 polls = 1 second)
-    // to avoid interfering with init.
-    if state.irq != 0 && poll >= 200 {
-        // Enable SPI for MSI delivery (handle_interrupt disables on each fire)
+    // Enable SPI after a short stabilization period (50 polls = 250ms)
+    // so xHCI init completes before interrupts start firing.
+    // Once enabled, handle_interrupt() re-enables SPI after each invocation,
+    // so this only matters for the very first activation.
+    if state.irq != 0 && poll >= 50 && !SPI_ACTIVATED.load(Ordering::Relaxed) {
+        SPI_ACTIVATED.store(true, Ordering::Release);
         crate::arch_impl::aarch64::gic::clear_spi_pending(state.irq);
         crate::arch_impl::aarch64::gic::enable_spi(state.irq);
         DIAG_SPI_ENABLE_COUNT.fetch_add(1, Ordering::Relaxed);
     }
 
     // Ensure HID_TRBS_QUEUED is set after initialization completes.
-    if poll >= 250 && !HID_TRBS_QUEUED.load(Ordering::Acquire) {
+    if poll >= 100 && !HID_TRBS_QUEUED.load(Ordering::Acquire) {
         HID_TRBS_QUEUED.store(true, Ordering::Release);
     }
 
-    // Re-ring doorbells after SPI activation (poll=300, ~1.5s after timer starts).
-    //
-    // The Parallels vxHC may not process interrupt endpoint TRBs until the MSI/SPI
-    // interrupt path is active. TRBs were queued and doorbells rung during init
-    // (before the timer started), but the SPI wasn't enabled until poll=200.
-    // Re-ringing doorbells after SPI activation tells the xHC to re-check the
-    // transfer rings now that the interrupt delivery path is ready.
+    // Re-ring doorbells once SPI is live (first successful poll at or after 75, ~375ms) so the
+    // xHC re-checks its transfer rings; `>=` keeps the one-shot if try_lock skipped tick 75.
     static DOORBELLS_RE_RUNG: AtomicBool = AtomicBool::new(false);
-    if poll == 300 && !DOORBELLS_RE_RUNG.load(Ordering::Acquire) {
+    if poll >= 75 && !DOORBELLS_RE_RUNG.load(Ordering::Acquire) {
         DOORBELLS_RE_RUNG.store(true, Ordering::Release);
         // Mouse EP3
         if state.mouse_slot != 0 && state.mouse_endpoint != 0 {
diff --git a/kernel/src/drivers/virtio/gpu_pci.rs b/kernel/src/drivers/virtio/gpu_pci.rs
index 4aae2b9f..26247643 100644
--- a/kernel/src/drivers/virtio/gpu_pci.rs
+++ b/kernel/src/drivers/virtio/gpu_pci.rs
@@ -233,10 +233,9 @@ static mut PCI_CMD_BUF: PciCmdBuffer = PciCmdBuffer { data: [0; 512] };
 static mut PCI_RESP_BUF: PciCmdBuffer = PciCmdBuffer { data: [0; 512] };
 
 // Default framebuffer dimensions (Parallels: set_scanout configures display mode)
-// 2560x1600 is the max that fits in the ~16MB GOP BAR0 region on Parallels.
-// On a Retina Mac, Parallels 2x-scales this to ~1280x800 window points.
-const DEFAULT_FB_WIDTH: u32 = 2560;
-const DEFAULT_FB_HEIGHT: u32 = 1600;
+// 1728x1080 matches the QEMU resolution for consistent performance comparison.
+const DEFAULT_FB_WIDTH: u32 = 1728;
+const DEFAULT_FB_HEIGHT: u32 = 1080;
 // Max supported resolution: 2560x1600 @ 32bpp = ~16.4MB
 const FB_MAX_WIDTH: u32 = 2560;
 const FB_MAX_HEIGHT: u32 = 1600;
@@ -380,15 +379,20 @@ pub fn init() -> Result<(), &'static str> {
     // If create_resource/attach_backing/set_scanout/flush time out, leaving
     // the flag true would mislead other code into thinking the device is usable.
 
-    // Query display info (ignore result — we override to our desired resolution).
-    let _ = get_display_info();
+    // Query display info to see what Parallels reports as native resolution.
+    let display_dims = get_display_info();
+    match display_dims {
+        Ok((dw, dh)) => crate::serial_println!("[virtio-gpu-pci] Display reports: {}x{}", dw, dh),
+        Err(e) => crate::serial_println!("[virtio-gpu-pci] GET_DISPLAY_INFO failed: {}", e),
+    }
 
-    // Override to our desired resolution.
-    // On Parallels, VirtIO GPU set_scanout controls the display MODE (stride,
-    // resolution) but actual pixels are read from BAR0 (the GOP address at
-    // 0x10000000). We use VirtIO GPU purely to configure a higher resolution
-    // than the GOP-reported 1024x768.
-    let (use_width, use_height) = (DEFAULT_FB_WIDTH, DEFAULT_FB_HEIGHT);
+    // Use the display-reported resolution if it's reasonable, otherwise
+    // fall back to our default. This respects the actual Parallels display
+    // mode instead of forcing a resolution that may be ignored.
+    let (use_width, use_height) = match display_dims {
+        Ok((w, h)) if w >= 640 && h >= 480 && w <= FB_MAX_WIDTH && h <= FB_MAX_HEIGHT => (w, h),
+        _ => (DEFAULT_FB_WIDTH, DEFAULT_FB_HEIGHT),
+    };
 
     // Update state with actual dimensions
     unsafe {
@@ -774,6 +778,19 @@ pub fn flush_rect(x: u32, y: u32, width: u32, height: u32) -> Result<(), &'stati
     })
 }
 
+/// Issue a RESOURCE_FLUSH for the given rectangle, skipping TRANSFER_TO_HOST_2D.
+///
+/// Meant for GOP hybrid mode, where the pixels already sit in BAR0 (the display
+/// scanout memory): the flush only tells Parallels which region changed, with
+/// no DMA transfer out of PCI_FRAMEBUFFER (unused in that mode).
+pub fn resource_flush_only(x: u32, y: u32, width: u32, height: u32) -> Result<(), &'static str> {
+    with_device_state(|dev| {
+        // Full barrier first, so earlier stores are ordered before the command.
+        fence(Ordering::SeqCst);
+        resource_flush_cmd(dev, x, y, width, height)
+    })
+}
+
 /// Get the framebuffer dimensions.
 pub fn dimensions() -> Option<(u32, u32)> {
     unsafe {
diff --git a/kernel/src/fs/procfs/mod.rs b/kernel/src/fs/procfs/mod.rs
index cbb28975..a08fd0a0 100644
--- a/kernel/src/fs/procfs/mod.rs
+++ b/kernel/src/fs/procfs/mod.rs
@@ -42,6 +42,7 @@ use alloc::vec::Vec;
 use spin::Mutex;
 
 mod trace;
+#[cfg(target_arch = "aarch64")]
 mod xhci;
 
 /// Procfs entry types
@@ -451,7 +452,10 @@ pub fn read_entry(entry_type: ProcEntryType) -> Result<String, i32> {
             let entries = list_xhci_entries();
             Ok(entries.join("\n") + "\n")
         }
+        #[cfg(target_arch = "aarch64")]
         ProcEntryType::XhciTrace => Ok(xhci::generate_xhci_trace()),
+        #[cfg(not(target_arch = "aarch64"))]
+        ProcEntryType::XhciTrace => Ok(String::from("")),
         ProcEntryType::BreenixDir => {
             // Directory listing
             Ok(String::from("testing\n"))
diff --git a/kernel/src/graphics/arm64_fb.rs b/kernel/src/graphics/arm64_fb.rs
index 8e13831e..3f26cbb4 100644
--- a/kernel/src/graphics/arm64_fb.rs
+++ b/kernel/src/graphics/arm64_fb.rs
@@ -128,7 +128,10 @@ pub fn take_dirty_rect() -> Option<(u32, u32, u32, u32)> {
 /// For VirtIO GPU, this issues transfer_to_host + resource_flush commands.
 pub fn flush_dirty_rect(x: u32, y: u32, w: u32, h: u32) -> Result<(), &'static str> {
     if is_gop_active() {
-        // GOP: writes go directly to display memory. DSB ensures visibility.
+        // GOP: pixels are in BAR0 (display scanout memory). DSB ensures the
+        // CPU's write buffer is drained so stores are visible to the display
+        // controller. Parallels scans BAR0 at its own refresh rate — no VirtIO
+        // RESOURCE_FLUSH needed (it's synchronous and would add 10-50ms).
         unsafe { core::arch::asm!("dsb sy", options(nostack, preserves_flags)); }
         Ok(())
     } else if crate::drivers::virtio::gpu_pci::is_initialized() {
@@ -180,7 +183,7 @@ pub fn is_gop_active() -> bool {
 
 /// Get the GOP framebuffer as a mutable byte slice.
 /// Returns None if GOP is not initialized.
-fn gop_framebuffer() -> Option<&'static mut [u8]> {
+pub fn gop_framebuffer() -> Option<&'static mut [u8]> {
     let ptr = GOP_FB_PTR.load(Ordering::Relaxed);
     let len = GOP_FB_LEN.load(Ordering::Relaxed);
     if ptr == 0 || len == 0 {
@@ -242,7 +245,7 @@ pub fn init_gop_framebuffer() -> Result<(), &'static str> {
     };
 
     // Initialize SHELL_FRAMEBUFFER
-    let shell_fb = ShellFrameBuffer { fb };
+    let shell_fb = ShellFrameBuffer { fb, double_buffer: None };
 
     // Cache immutable dimensions for lock-free access by sys_fbinfo
     let _ = FB_INFO_CACHE.try_init_once(|| FbInfoCache {
@@ -315,7 +318,7 @@ pub fn init_gpu_pci_gop_framebuffer() -> Result<(), &'static str> {
         is_bgr_flag: true, // B8G8R8A8_UNORM
     };
 
-    let shell_fb = ShellFrameBuffer { fb };
+    let shell_fb = ShellFrameBuffer { fb, double_buffer: None };
 
     let _ = FB_INFO_CACHE.try_init_once(|| FbInfoCache {
         width,
@@ -603,6 +606,8 @@ pub fn clear_screen(color: Color) -> Result<(), &'static str> {
 pub struct ShellFrameBuffer {
     /// The underlying framebuffer
     fb: Arm64FrameBuffer,
+    /// Double buffer: shadow buffer in cached RAM, flushed to hardware (GOP BAR0)
+    double_buffer: Option<super::double_buffer::DoubleBufferedFrameBuffer>,
 }
 
 impl ShellFrameBuffer {
@@ -610,6 +615,7 @@ impl ShellFrameBuffer {
     pub fn new() -> Option<Self> {
         Some(Self {
             fb: Arm64FrameBuffer::new()?,
+            double_buffer: None,
         })
     }
 
@@ -644,14 +650,60 @@ impl ShellFrameBuffer {
         self.fb.flush()
     }
 
-    /// Get double buffer (returns None on ARM64)
-    ///
-    /// On ARM64, the VirtIO GPU handles buffering, so we don't need
-    /// a software double buffer. This method exists for API compatibility.
-    #[allow(dead_code)]
+    /// Get mutable access to the double buffer, if available.
     pub fn double_buffer_mut(&mut self) -> Option<&mut super::double_buffer::DoubleBufferedFrameBuffer> {
-        // ARM64 VirtIO GPU handles buffering internally
-        None
+        self.double_buffer.as_mut()
+    }
+
+    /// Upgrade to double-buffered rendering.
+    ///
+    /// Allocates a shadow buffer in cached heap RAM. All pixel writes go to
+    /// the shadow buffer, and `flush_if_dirty()` copies dirty regions to the
+    /// hardware framebuffer (GOP BAR0), which is slow to write directly on
+    /// Parallels. The shadow is seeded from the current hardware contents so
+    /// whatever is already on screen survives the switch.
+    ///
+    /// No-op if already upgraded, if the framebuffer is not in GOP mode, or
+    /// if no GOP pointer/length has been published.
+    ///
+    /// Must be called after heap initialization.
+    pub fn upgrade_to_double_buffer(&mut self) {
+        // Only GOP mode writes to slow device memory; skip repeat upgrades.
+        if self.double_buffer.is_some() || !self.fb.is_gop {
+            return;
+        }
+
+        let hw_ptr = GOP_FB_PTR.load(Ordering::Relaxed);
+        let hw_len = GOP_FB_LEN.load(Ordering::Relaxed) as usize;
+        if hw_ptr == 0 || hw_len == 0 {
+            return;
+        }
+
+        let stride_bytes = self.fb.stride * self.fb.bytes_per_pixel;
+        let mut db = super::double_buffer::DoubleBufferedFrameBuffer::new(
+            hw_ptr as *mut u8,
+            hw_len,
+            stride_bytes,
+            self.fb.height,
+        );
+
+        // Seed the shadow with what is on screen (split-screen layout, text).
+        let shadow = db.buffer_mut();
+        let shadow_len = shadow.len();
+        let copy_len = shadow_len.min(hw_len);
+        // SAFETY: `copy_len` is bounded by both the shadow length and `hw_len`.
+        // Assumes `hw_ptr`/`hw_len` describe a live GOP mapping and the shadow
+        // is a separate allocation that cannot overlap it — confirm with `new`.
+        unsafe {
+            core::ptr::copy_nonoverlapping(hw_ptr as *const u8, shadow.as_mut_ptr(), copy_len);
+        }
+        self.double_buffer = Some(db);
+
+        // Report the shadow's own size; it need not equal the hardware length.
+        crate::serial_println!(
+            "[arm64-fb] Upgraded to double buffering: {}x{} shadow buffer ({} KB)",
+            self.fb.width, self.fb.height, shadow_len / 1024
+        );
+    }
 }
 
@@ -677,19 +729,83 @@ impl Canvas for ShellFrameBuffer {
     }
 
     fn set_pixel(&mut self, x: i32, y: i32, color: Color) {
-        self.fb.set_pixel(x, y, color);
+        let db = match self.double_buffer.as_mut() {
+            Some(db) => db,
+            None => {
+                self.fb.set_pixel(x, y, color);
+                return;
+            }
+        };
+        let (col, row) = (x as usize, y as usize);
+        if x < 0 || y < 0 || col >= self.fb.width || row >= self.fb.height {
+            return;
+        }
+        let bpp = self.fb.bytes_per_pixel;
+        let encoded = color.to_pixel_bytes(bpp, self.fb.is_bgr_flag);
+        let start = (row * self.fb.stride + col) * bpp;
+        let shadow = db.buffer_mut();
+        if start + bpp <= shadow.len() {
+            shadow[start..start + bpp].copy_from_slice(&encoded[..bpp]);
+            // NOTE(review): these are buffer-absolute byte offsets, while mark_dirty_region feeds
+            // row-relative byte columns to mark_region_dirty_rect — confirm both match the tracker.
+            db.mark_region_dirty(row, start, start + bpp);
+        }
     }
 
     fn get_pixel(&self, x: i32, y: i32) -> Option<Color> {
-        self.fb.get_pixel(x, y)
+        // Without a shadow buffer, defer to the underlying framebuffer.
+        let db = match self.double_buffer.as_ref() {
+            Some(db) => db,
+            None => return self.fb.get_pixel(x, y),
+        };
+
+        // Negative coordinates would wrap when widened to usize, so the sign
+        // is tested alongside the width/height bounds.
+        let (col, row) = (x as usize, y as usize);
+        let on_screen = x >= 0 && y >= 0 && col < self.fb.width && row < self.fb.height;
+        if !on_screen {
+            return None;
+        }
+
+        // Byte range of the pixel; `stride` is in pixels, hence the scaling by bpp.
+        let bpp = self.fb.bytes_per_pixel;
+        let start = (row * self.fb.stride + col) * bpp;
+        let end = start + bpp;
+
+        // Read from the shadow copy; a range past the end of the buffer
+        // yields None rather than panicking.
+        db.buffer()
+            .get(start..end)
+            .map(|px| Color::from_pixel_bytes(px, bpp, self.fb.is_bgr_flag))
     }
 
     fn buffer_mut(&mut self) -> &mut [u8] {
-        self.fb.buffer_mut()
+        match self.double_buffer.as_mut() {
+            // Hand out the shadow buffer whenever double buffering is enabled.
+            Some(db) => db.buffer_mut(),
+            None => self.fb.buffer_mut(),
+        }
     }
 
     fn buffer(&self) -> &[u8] {
-        self.fb.buffer()
+        match self.double_buffer.as_ref() {
+            // Hand out the shadow buffer whenever double buffering is enabled.
+            Some(db) => db.buffer(),
+            None => self.fb.buffer(),
+        }
+    }
+
+    fn mark_dirty_region(&mut self, x: usize, y: usize, width: usize, height: usize) {
+        if let Some(shadow) = self.double_buffer.as_mut() {
+            let px = self.fb.bytes_per_pixel;
+            let row_bytes = self.fb.stride * px;
+            let first_col = row_bytes.min(x * px);
+            let last_col = row_bytes.min((x + width) * px);
+            let last_row = self.fb.height.min(y + height);
+            shadow.mark_region_dirty_rect(y, last_row, first_col, last_col);
+        }
+        // Independently of the shadow buffer, record the rect in the module-level dirty tracker.
+        mark_dirty(x as u32, y as u32, width as u32, height as u32);
     }
 }
 
@@ -737,6 +853,17 @@ pub fn init_shell_framebuffer() -> Result<(), &'static str> {
     Ok(())
 }
 
+/// Switch the global shell framebuffer over to double-buffered rendering.
+///
+/// Does nothing when `SHELL_FRAMEBUFFER` has not been initialized. Repeat
+/// calls are harmless: the method returns early once a shadow buffer exists.
+/// Must be called after heap initialization (the shadow lives on the heap).
+pub fn upgrade_to_double_buffer() {
+    if let Some(shell) = SHELL_FRAMEBUFFER.get() {
+        shell.lock().upgrade_to_double_buffer();
+    }
+}
+
 /// Get the framebuffer dimensions
 pub fn dimensions() -> Option<(usize, usize)> {
     SHELL_FRAMEBUFFER.get().and_then(|fb| {
diff --git a/kernel/src/graphics/font.rs b/kernel/src/graphics/font.rs
index 04d73527..fa8b9633 100644
--- a/kernel/src/graphics/font.rs
+++ b/kernel/src/graphics/font.rs
@@ -155,6 +155,11 @@ impl Glyph {
         self.rasterized.height()
     }
 
+    /// Borrow the rasterized glyph: one slice per row, one intensity byte per pixel.
+    pub fn raster(&self) -> &[&[u8]] {
+        self.rasterized.raster()
+    }
+
     /// Iterate over the glyph pixels with coordinates and intensity.
     /// Yields (x, y, intensity) for each pixel.
     pub fn pixels(&self) -> impl Iterator<Item = (usize, usize, u8)> + '_ {
diff --git a/kernel/src/graphics/primitives.rs b/kernel/src/graphics/primitives.rs
index 97827bba..8a665591 100644
--- a/kernel/src/graphics/primitives.rs
+++ b/kernel/src/graphics/primitives.rs
@@ -495,6 +495,17 @@ pub fn draw_char(canvas: &mut impl Canvas, x: i32, y: i32, c: char, style: &Text
 
 /// Draw a glyph at the specified position with the given style.
 fn draw_glyph(canvas: &mut impl Canvas, x: i32, y: i32, glyph: &Glyph, style: &TextStyle) {
+    // Fast path: glyph fully within canvas bounds — write directly to buffer
+    if x >= 0 && y >= 0
+        && (x as usize + glyph.width()) <= canvas.width()
+        && (y as usize + glyph.height()) <= canvas.height()
+    {
+        draw_glyph_direct(canvas, x as usize, y as usize, glyph, style);
+        canvas.mark_dirty_region(x as usize, y as usize, glyph.width(), glyph.height());
+        return;
+    }
+
+    // Slow path: per-pixel with bounds checking for edge cases
     for (gx, gy, intensity) in glyph.pixels() {
         if intensity == 0 {
             continue;
@@ -504,14 +515,11 @@ fn draw_glyph(canvas: &mut impl Canvas, x: i32, y: i32, glyph: &Glyph, style: &T
         let py = y + gy as i32;
 
         let color = if let Some(bg) = style.background {
-            // Explicit background - blend foreground with specified background
             blend_colors(style.foreground, bg, intensity)
         } else {
-            // No explicit background - blend with actual canvas pixel for proper anti-aliasing
             if let Some(existing) = canvas.get_pixel(px, py) {
                 blend_colors(style.foreground, existing, intensity)
             } else {
-                // Out of bounds, skip
                 continue;
             }
         };
@@ -519,12 +527,78 @@ fn draw_glyph(canvas: &mut impl Canvas, x: i32, y: i32, glyph: &Glyph, style: &T
         canvas.set_pixel(px, py, color);
     }
 
-    // Mark the entire glyph bounding box dirty once (not per-pixel)
     if x >= 0 && y >= 0 {
         canvas.mark_dirty_region(x as usize, y as usize, glyph.width(), glyph.height());
     }
 }
 
+/// Blit a glyph straight into the canvas byte buffer.
+///
+/// `x`/`y` are the glyph's top-left corner in canvas pixels. Avoids the per-pixel
+/// `set_pixel`/`get_pixel` calls by indexing `buffer_mut()` directly. The caller
+/// must ensure the glyph lies fully within the canvas; as a backstop, rendering
+/// stops at the first row whose bytes would run past the end of the buffer.
+///
+/// Per pixel, by intensity:
+/// - `0`: filled with `style.background` if one is set, otherwise left untouched
+/// - `255`: overwritten with the foreground color
+/// - otherwise: foreground blended over the explicit background, or over the
+///   pixel already in the buffer when no background is set
+fn draw_glyph_direct(canvas: &mut impl Canvas, x: usize, y: usize, glyph: &Glyph, style: &TextStyle) {
+    let bpp = canvas.bytes_per_pixel();
+    let stride = canvas.stride();
+    let is_bgr = canvas.is_bgr();
+
+    let row_pitch = stride * bpp;
+    let cols = glyph.width();
+    let span = cols * bpp;
+
+    // Encode the solid colors once up front rather than once per pixel.
+    let fg = style.foreground;
+    let fg_px = fg.to_pixel_bytes(bpp, is_bgr);
+    let backdrop = style.background.map(|bg| (bg, bg.to_pixel_bytes(bpp, is_bgr)));
+
+    // Take the mutable byte buffer last: the layout queries above also go
+    // through `canvas`, and this borrow lasts for the rest of the function.
+    let pixels = canvas.buffer_mut();
+
+    for (dy, coverage) in glyph.raster().iter().enumerate() {
+        let row_start = (y + dy) * row_pitch + x * bpp;
+        if row_start + span > pixels.len() {
+            // Backstop in case the buffer is shorter than the caller's bounds imply.
+            break;
+        }
+
+        for (dx, &alpha) in coverage.iter().take(cols).enumerate() {
+            let at = row_start + dx * bpp;
+            let cell = &mut pixels[at..at + bpp];
+
+            match (alpha, &backdrop) {
+                // Fully transparent and nothing to fill with: keep the pixel.
+                (0, None) => {}
+                // Fully transparent over an explicit background: paint it.
+                (0, Some((_, bg_px))) => cell.copy_from_slice(&bg_px[..bpp]),
+                // Fully opaque: the foreground wins regardless of background.
+                (255, _) => cell.copy_from_slice(&fg_px[..bpp]),
+                // Partial coverage over an explicit background.
+                (_, Some((bg, _))) => {
+                    let mixed = blend_colors(fg, *bg, alpha);
+                    let mixed_px = mixed.to_pixel_bytes(bpp, is_bgr);
+                    cell.copy_from_slice(&mixed_px[..bpp]);
+                }
+                // Partial coverage with no background: blend over the pixel
+                // that is already in the buffer.
+                (_, None) => {
+                    let under = Color::from_pixel_bytes(&*cell, bpp, is_bgr);
+                    let mixed = blend_colors(fg, under, alpha);
+                    let mixed_px = mixed.to_pixel_bytes(bpp, is_bgr);
+                    cell.copy_from_slice(&mixed_px[..bpp]);
+                }
+            }
+        }
+    }
+}
+
 /// Draw a text string at the specified position.
 ///
 /// Handles newlines by moving to the next line.
diff --git a/kernel/src/graphics/render_task.rs b/kernel/src/graphics/render_task.rs
index e0981ece..02008f67 100644
--- a/kernel/src/graphics/render_task.rs
+++ b/kernel/src/graphics/render_task.rs
@@ -211,16 +211,20 @@ fn flush_framebuffer() -> bool {
     }
     #[cfg(target_arch = "aarch64")]
     {
-        // Only flush if pixels have changed. The dirty rect is set by:
-        //   - sys_fbdraw (syscall path, after fast pixel copies)
-        //   - particles thread (after rendering)
-        //   - cursor updates (above)
-        //   - render_queue/split_screen text rendering
-        //
+        // First, flush the double buffer (shadow → BAR0 copy) if present.
+        // This must happen while holding SHELL_FRAMEBUFFER so the shadow
+        // buffer isn't modified mid-copy.
+        if let Some(fb) = crate::graphics::arm64_fb::SHELL_FRAMEBUFFER.get() {
+            if let Some(mut fb_guard) = fb.try_lock() {
+                if let Some(db) = fb_guard.double_buffer_mut() {
+                    db.flush_if_dirty();
+                }
+            }
+        }
+
+        // Then flush dirty regions to the display (DSB + optional VirtIO hint).
         // No SHELL_FRAMEBUFFER lock needed here — we're not touching the pixel
-        // buffer, just submitting GPU commands via gpu_mmio. This eliminates the
-        // two-lock nesting (SHELL_FRAMEBUFFER + GPU_LOCK) that caused deadlocks
-        // when sys_fbdraw held SHELL_FRAMEBUFFER with IRQs disabled.
+        // buffer, just submitting GPU commands.
         if let Some((x, y, w, h)) = crate::graphics::arm64_fb::take_dirty_rect() {
             if let Err(e) = crate::graphics::arm64_fb::flush_dirty_rect(x, y, w, h) {
                 crate::serial_println!("[render] GPU flush failed: {}", e);
diff --git a/kernel/src/graphics/split_screen.rs b/kernel/src/graphics/split_screen.rs
index 81bd7d96..cadc87f7 100644
--- a/kernel/src/graphics/split_screen.rs
+++ b/kernel/src/graphics/split_screen.rs
@@ -238,8 +238,9 @@ pub fn write_char_to_terminal(c: char) -> bool {
                     if let Some(db) = fb_guard.double_buffer_mut() {
                         db.flush_if_dirty();
                     }
-                    #[cfg(target_arch = "aarch64")]
-                    super::arm64_fb::mark_full_dirty();
+                    // ARM64: dirty regions are tracked automatically via
+                    // Canvas::mark_dirty_region() in the primitives layer.
+                    // The render thread flushes dirty rects periodically.
 
                     return true;
                 }
@@ -266,8 +267,8 @@ pub fn write_str_to_terminal(s: &str) -> bool {
                     if let Some(db) = fb_guard.double_buffer_mut() {
                         db.flush_if_dirty();
                     }
-                    #[cfg(target_arch = "aarch64")]
-                    super::arm64_fb::mark_full_dirty();
+                    // ARM64: dirty regions are tracked automatically via
+                    // Canvas::mark_dirty_region() in the primitives layer.
 
                     return true;
                 }
@@ -291,8 +292,7 @@ pub fn toggle_terminal_cursor() {
                     if let Some(db) = fb_guard.double_buffer_mut() {
                         db.flush_if_dirty();
                     }
-                    #[cfg(target_arch = "aarch64")]
-                    super::arm64_fb::mark_full_dirty();
+                    // ARM64: dirty regions are tracked automatically.
                 }
             }
         }
diff --git a/kernel/src/main_aarch64.rs b/kernel/src/main_aarch64.rs
index 68712347..512f3835 100644
--- a/kernel/src/main_aarch64.rs
+++ b/kernel/src/main_aarch64.rs
@@ -567,6 +567,13 @@ pub extern "C" fn kernel_main(hw_config_ptr: u64) -> ! {
         false
     };
 
+    // Upgrade framebuffer to double buffering now that heap is available.
+    // This allocates a shadow buffer in cached RAM so pixel writes are fast
+    // (~1ns vs ~100ns for direct GOP BAR0 writes on Parallels).
+    if has_display {
+        kernel::graphics::arm64_fb::upgrade_to_double_buffer();
+    }
+
     // Initialize input devices (capability-based detection)
     if kernel::drivers::usb::xhci::is_initialized() {
         // USB HID keyboard/mouse via XHCI — already set up during drivers::init()
diff --git a/kernel/src/syscall/graphics.rs b/kernel/src/syscall/graphics.rs
index d8ea65e0..3e4f3f40 100644
--- a/kernel/src/syscall/graphics.rs
+++ b/kernel/src/syscall/graphics.rs
@@ -545,24 +545,63 @@ pub fn sys_fbdraw(cmd_ptr: u64) -> SyscallResult {
                         (0, mmap_info.height)
                     };
 
-                    // Copy dirty rows from user buffer → GPU framebuffer at correct x_offset.
-                    // ARM64 has no double buffer; writes go directly to VirtIO GPU memory.
                     let fb_stride_bytes = fb_guard.stride() * fb_guard.bytes_per_pixel();
                     let row_bytes = mmap_info.width * mmap_info.bpp;
                     let x_byte_offset = mmap_info.x_offset * mmap_info.bpp;
-                    let gpu_buf = fb_guard.buffer_mut();
-
-                    for y in y_start..y_end {
-                        let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride;
-                        let gpu_row_offset = y * fb_stride_bytes + x_byte_offset;
-
-                        if gpu_row_offset + row_bytes <= gpu_buf.len() {
-                            unsafe {
-                                core::ptr::copy_nonoverlapping(
-                                    user_row_ptr as *const u8,
-                                    gpu_buf[gpu_row_offset..].as_mut_ptr(),
-                                    row_bytes,
-                                );
+
+                    // For GOP mode with double buffer, copy user → BAR0 directly
+                    // (single copy) instead of user → shadow → BAR0 (double copy).
+                    // The NC memory mapping makes BAR0 writes fast via write-combining.
+                    // Also update the shadow buffer so terminal reads stay consistent.
+                    if crate::graphics::arm64_fb::is_gop_active() {
+                        if let Some(gop_buf) = crate::graphics::arm64_fb::gop_framebuffer() {
+                            for y in y_start..y_end {
+                                let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride;
+                                let target_row_offset = y * fb_stride_bytes + x_byte_offset;
+
+                                if target_row_offset + row_bytes <= gop_buf.len() {
+                                    unsafe {
+                                        core::ptr::copy_nonoverlapping(
+                                            user_row_ptr as *const u8,
+                                            gop_buf[target_row_offset..].as_mut_ptr(),
+                                            row_bytes,
+                                        );
+                                    }
+                                }
+                            }
+                            // Also update shadow buffer to keep it consistent
+                            if let Some(db) = fb_guard.double_buffer_mut() {
+                                let shadow = db.buffer_mut();
+                                for y in y_start..y_end {
+                                    let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride;
+                                    let target_row_offset = y * fb_stride_bytes + x_byte_offset;
+                                    if target_row_offset + row_bytes <= shadow.len() {
+                                        unsafe {
+                                            core::ptr::copy_nonoverlapping(
+                                                user_row_ptr as *const u8,
+                                                shadow[target_row_offset..].as_mut_ptr(),
+                                                row_bytes,
+                                            );
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    } else {
+                        // Non-GOP path: copy to GPU buffer (VirtIO MMIO/PCI framebuffer)
+                        let target_buf = fb_guard.buffer_mut();
+                        for y in y_start..y_end {
+                            let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride;
+                            let target_row_offset = y * fb_stride_bytes + x_byte_offset;
+
+                            if target_row_offset + row_bytes <= target_buf.len() {
+                                unsafe {
+                                    core::ptr::copy_nonoverlapping(
+                                        user_row_ptr as *const u8,
+                                        target_buf[target_row_offset..].as_mut_ptr(),
+                                        row_bytes,
+                                    );
+                                }
                             }
                         }
                     }
@@ -574,11 +613,10 @@ pub fn sys_fbdraw(cmd_ptr: u64) -> SyscallResult {
                 // for terminal text while we submit GPU commands.
                 drop(fb_guard);
 
-                // Synchronous GPU flush — submit transfer_to_host + resource_flush
-                // directly in the syscall instead of deferring to the render thread.
-                // This eliminates scheduling latency: bounce's frame is displayed
-                // immediately rather than waiting for the render thread to wake up
-                // (which could take 5ms+ due to timer tick granularity).
+                // Synchronous GPU flush — submit resource_flush (or transfer_to_host +
+                // resource_flush for non-GOP) directly in the syscall. This eliminates
+                // scheduling latency: bounce's frame is displayed immediately rather
+                // than waiting for the render thread (5ms+ due to timer tick).
                 if let Some((fx, fy, fw, fh)) = flush_rect {
                     let _ = crate::graphics::arm64_fb::flush_dirty_rect(fx, fy, fw, fh);
                 }
diff --git a/parallels-loader/src/page_tables.rs b/parallels-loader/src/page_tables.rs
index 5e288bb1..8ba87522 100644
--- a/parallels-loader/src/page_tables.rs
+++ b/parallels-loader/src/page_tables.rs
@@ -177,13 +177,23 @@ pub fn build_page_tables(storage: &mut PageTableStorage) -> (u64, u64) {
     // TTBR1 L1[0] -> L2 (HHDM + 0x00000000 - 0x3FFFFFFF)
     write_entry(ttbr1_l1, 0, ttbr1_l2_dev | attr::TABLE_DESC);
 
-    // Map all 2MB blocks in 0x00000000-0x3FFFFFFF as device memory
+    // Map all 2MB blocks in 0x00000000-0x3FFFFFFF as device memory,
+    // except for the GOP BAR0 framebuffer region (0x10000000-0x10FFFFFF,
+    // L2 indices 128-135), which uses Normal-NC for write-combining.
+    // Write-combining allows the CPU to coalesce/reorder stores, making
+    // bulk memcpy from the shadow buffer to BAR0 significantly faster while
+    // maintaining coherency with the hypervisor's display scanout.
     // This covers: GIC (0x02010000), UART (0x02110000), PCI ECAM (0x02300000),
     // GICR (0x02500000), PCI MMIO (0x10000000-0x1FFFFFFF)
     for i in 0..512u64 {
         let phys = i * 0x20_0000; // 2MB blocks
-        write_entry(ttbr0_l2_dev, i as usize, phys | attr::DEVICE_BLOCK);
-        write_entry(ttbr1_l2_dev, i as usize, phys | attr::DEVICE_BLOCK);
+        let block_attr = if (128..136).contains(&i) {
+            attr::NC_BLOCK // GOP BAR0: write-combining for framebuffer
+        } else {
+            attr::DEVICE_BLOCK
+        };
+        write_entry(ttbr0_l2_dev, i as usize, phys | block_attr);
+        write_entry(ttbr1_l2_dev, i as usize, phys | block_attr);
     }
 
     // --- RAM: 0x40000000 - 0xBFFFFFFF (2GB, L1 entries 1-2) ---