diff --git a/kernel/src/drivers/usb/xhci.rs b/kernel/src/drivers/usb/xhci.rs
index ebb33941..c1cf7665 100644
--- a/kernel/src/drivers/usb/xhci.rs
+++ b/kernel/src/drivers/usb/xhci.rs
@@ -466,6 +466,9 @@ pub static DIAG_KBD_PORTSC: AtomicU32 = AtomicU32::new(0);
 pub static DIAG_KBD_EP_STATE: AtomicU32 = AtomicU32::new(0);
 /// Periodic diagnostic: SPI enable count (how many times SPI was re-enabled).
 pub static DIAG_SPI_ENABLE_COUNT: AtomicU64 = AtomicU64::new(0);
+/// Set once after the first deferred SPI activation; handle_interrupt
+/// keeps SPI alive after that, so poll_hid_events doesn't re-enable.
+static SPI_ACTIVATED: AtomicBool = AtomicBool::new(false);
 /// Diagnostic counter for doorbell/transfer events (shown as `db=` in heartbeat).
 pub static DIAG_DOORBELL_EP_STATE: AtomicU32 = AtomicU32::new(0);
 /// Diagnostic: last CC received for any GET_REPORT Transfer Event (0xFF = none seen yet).
@@ -4777,9 +4780,9 @@ pub fn init(pci_dev: &crate::drivers::pci::Device) -> Result<(), &'static str> {
 /// Handle an XHCI interrupt.
 ///
 /// Called from the GIC interrupt handler when the XHCI IRQ fires.
-/// Immediately disables the GIC SPI to prevent re-delivery storms,
-/// then processes all pending events. The SPI is re-enabled by
-/// poll_hid_events() on the next timer tick (~5ms later).
+/// Disables the GIC SPI while processing to prevent re-delivery during
+/// IMAN/ERDP acknowledgment, then re-enables it before returning so the
+/// next event gets a real interrupt with no polling delay.
 pub fn handle_interrupt() {
     if !XHCI_INITIALIZED.load(Ordering::Acquire) {
         // SPI should not be enabled during init (it's deferred until
@@ -5012,6 +5015,17 @@ pub fn handle_interrupt() {
         }
     }
 
+    // Re-enable the GIC SPI now that we've drained the event ring. An MSI
+    // raised by the IMAN/ERDP writes above fires as a new interrupt after we
+    // return; that invocation finds an empty ring (IP=0, no cycle-bit match)
+    // and returns quickly — no storm, because we only write IMAN/USBSTS when
+    // their bits are actually set. NOTE(review): clear_spi_pending() presumably
+    // drops an MSI already latched; confirm poll_hid_events drains that case.
+    if state.irq != 0 {
+        crate::arch_impl::aarch64::gic::clear_spi_pending(state.irq);
+        crate::arch_impl::aarch64::gic::enable_spi(state.irq);
+    }
+
 }
 
 // =============================================================================
@@ -5186,10 +5200,14 @@ fn deferred_queue_trbs(state: &XhciState) {
     }
 }
 
+/// Timer-tick housekeeping for xHCI.
 ///
-/// Called from the timer interrupt at ~200 Hz. Uses `try_lock()` to avoid
-/// deadlocking if the lock is held by non-interrupt code. Bypasses the
-/// IMAN.IP check since that may not be set without a wired interrupt line.
+/// Called from the timer interrupt at 200 Hz (every 5ms). Handles:
+/// - One-time deferred SPI activation (first 250ms after init)
+/// - Endpoint reset recovery for CC=12 errors
+/// - Doorbell re-ring after SPI activation
+/// - Draining any events the MSI handler missed (safety net only;
+///   the primary event path is handle_interrupt)
 pub fn poll_hid_events() {
     if !XHCI_INITIALIZED.load(Ordering::Acquire) {
         return;
@@ -5197,13 +5215,6 @@ pub fn poll_hid_events() {
 
     POLL_COUNT.fetch_add(1, Ordering::Relaxed);
 
-    // Rate-limit: poll every 4th tick (~50 Hz at 200 Hz timer).
-    // Balances responsiveness (20ms latency) with hypervisor overhead.
-    let poll = POLL_COUNT.load(Ordering::Relaxed);
-    if poll % 4 != 0 {
-        return;
-    }
-
     // try_lock: if someone else holds the lock, skip this poll cycle
     let _guard = match XHCI_LOCK.try_lock() {
         Some(g) => g,
@@ -5558,29 +5569,26 @@ pub fn poll_hid_events() {
     }
 
     // Deferred MSI activation.
-    // SPI is enabled after a stabilization period (200 polls = 1 second)
-    // to avoid interfering with init.
-    if state.irq != 0 && poll >= 200 {
-        // Enable SPI for MSI delivery (handle_interrupt disables on each fire)
+    // Enable SPI after a short stabilization period (50 polls = 250ms) so xHCI
+    // init completes before interrupts fire. handle_interrupt() re-enables SPI
+    // after each invocation, so this only runs once. NOTE(review): this patch
+    // drops the `let poll = ...` above; confirm `poll` is still bound here.
+    if state.irq != 0 && poll >= 50 && !SPI_ACTIVATED.load(Ordering::Relaxed) {
+        SPI_ACTIVATED.store(true, Ordering::Release);
         crate::arch_impl::aarch64::gic::clear_spi_pending(state.irq);
         crate::arch_impl::aarch64::gic::enable_spi(state.irq);
         DIAG_SPI_ENABLE_COUNT.fetch_add(1, Ordering::Relaxed);
     }
 
     // Ensure HID_TRBS_QUEUED is set after initialization completes.
-    if poll >= 250 && !HID_TRBS_QUEUED.load(Ordering::Acquire) {
+    if poll >= 100 && !HID_TRBS_QUEUED.load(Ordering::Acquire) {
         HID_TRBS_QUEUED.store(true, Ordering::Release);
     }
 
-    // Re-ring doorbells after SPI activation (poll=300, ~1.5s after timer starts).
-    //
-    // The Parallels vxHC may not process interrupt endpoint TRBs until the MSI/SPI
-    // interrupt path is active. TRBs were queued and doorbells rung during init
-    // (before the timer started), but the SPI wasn't enabled until poll=200.
-    // Re-ringing doorbells after SPI activation tells the xHC to re-check the
-    // transfer rings now that the interrupt delivery path is ready.
+    // Re-ring doorbells shortly after SPI activation (poll=75, ~375ms).
+    // Tells the xHC to re-check transfer rings now that the interrupt path is live.
static DOORBELLS_RE_RUNG: AtomicBool = AtomicBool::new(false);
-    if poll == 300 && !DOORBELLS_RE_RUNG.load(Ordering::Acquire) {
+    if poll >= 75 && !DOORBELLS_RE_RUNG.load(Ordering::Acquire) {
         DOORBELLS_RE_RUNG.store(true, Ordering::Release);
         // Mouse EP3
         if state.mouse_slot != 0 && state.mouse_endpoint != 0 {
diff --git a/kernel/src/drivers/virtio/gpu_pci.rs b/kernel/src/drivers/virtio/gpu_pci.rs
index 4aae2b9f..26247643 100644
--- a/kernel/src/drivers/virtio/gpu_pci.rs
+++ b/kernel/src/drivers/virtio/gpu_pci.rs
@@ -233,10 +233,9 @@ static mut PCI_CMD_BUF: PciCmdBuffer = PciCmdBuffer { data: [0; 512] };
 static mut PCI_RESP_BUF: PciCmdBuffer = PciCmdBuffer { data: [0; 512] };
 
 // Default framebuffer dimensions (Parallels: set_scanout configures display mode)
-// 2560x1600 is the max that fits in the ~16MB GOP BAR0 region on Parallels.
-// On a Retina Mac, Parallels 2x-scales this to ~1280x800 window points.
-const DEFAULT_FB_WIDTH: u32 = 2560;
-const DEFAULT_FB_HEIGHT: u32 = 1600;
+// 1728x1080 matches the QEMU resolution for consistent performance comparison.
+const DEFAULT_FB_WIDTH: u32 = 1728;
+const DEFAULT_FB_HEIGHT: u32 = 1080;
 // Max supported resolution: 2560x1600 @ 32bpp = ~16.4MB
 const FB_MAX_WIDTH: u32 = 2560;
 const FB_MAX_HEIGHT: u32 = 1600;
@@ -380,15 +379,20 @@ pub fn init() -> Result<(), &'static str> {
     // If create_resource/attach_backing/set_scanout/flush time out, leaving
     // the flag true would mislead other code into thinking the device is usable.
 
-    // Query display info (ignore result — we override to our desired resolution).
-    let _ = get_display_info();
+    // Query display info to see what Parallels reports as native resolution.
+    let display_dims = get_display_info();
+    match display_dims {
+        Ok((dw, dh)) => crate::serial_println!("[virtio-gpu-pci] Display reports: {}x{}", dw, dh),
+        Err(e) => crate::serial_println!("[virtio-gpu-pci] GET_DISPLAY_INFO failed: {}", e),
+    }
 
-    // Override to our desired resolution.
-    // On Parallels, VirtIO GPU set_scanout controls the display MODE (stride,
-    // resolution) but actual pixels are read from BAR0 (the GOP address at
-    // 0x10000000). We use VirtIO GPU purely to configure a higher resolution
-    // than the GOP-reported 1024x768.
-    let (use_width, use_height) = (DEFAULT_FB_WIDTH, DEFAULT_FB_HEIGHT);
+    // Use the display-reported resolution if it's reasonable, otherwise
+    // fall back to our default. This respects the actual Parallels display
+    // mode instead of forcing a resolution that may be ignored.
+    let (use_width, use_height) = match display_dims {
+        Ok((w, h)) if w >= 640 && h >= 480 && w <= FB_MAX_WIDTH && h <= FB_MAX_HEIGHT => (w, h),
+        _ => (DEFAULT_FB_WIDTH, DEFAULT_FB_HEIGHT),
+    };
 
     // Update state with actual dimensions
     unsafe {
@@ -774,6 +778,19 @@ pub fn flush_rect(x: u32, y: u32, width: u32, height: u32) -> Result<(), &'stati
     })
 }
 
+/// Send only a RESOURCE_FLUSH command without TRANSFER_TO_HOST_2D.
+///
+/// Intended for GOP hybrid mode where pixels are already in BAR0 (the display
+/// scanout memory). The RESOURCE_FLUSH tells Parallels which region changed
+/// so it can update the host window, without the overhead of a DMA transfer
+/// from PCI_FRAMEBUFFER (which isn't used in hybrid mode).
+pub fn resource_flush_only(x: u32, y: u32, width: u32, height: u32) -> Result<(), &'static str> {
+    with_device_state(|state| {
+        fence(Ordering::SeqCst);
+        resource_flush_cmd(state, x, y, width, height)
+    })
+}
+
 /// Get the framebuffer dimensions.
 pub fn dimensions() -> Option<(u32, u32)> {
     unsafe {
diff --git a/kernel/src/fs/procfs/mod.rs b/kernel/src/fs/procfs/mod.rs
index cbb28975..a08fd0a0 100644
--- a/kernel/src/fs/procfs/mod.rs
+++ b/kernel/src/fs/procfs/mod.rs
@@ -42,6 +42,7 @@ use alloc::vec::Vec;
 use spin::Mutex;
 
 mod trace;
+#[cfg(target_arch = "aarch64")]
 mod xhci;
 
 /// Procfs entry types
@@ -451,7 +452,10 @@ pub fn read_entry(entry_type: ProcEntryType) -> Result {
             let entries = list_xhci_entries();
             Ok(entries.join("\n") + "\n")
         }
+        #[cfg(target_arch = "aarch64")]
         ProcEntryType::XhciTrace => Ok(xhci::generate_xhci_trace()),
+        #[cfg(not(target_arch = "aarch64"))]
+        ProcEntryType::XhciTrace => Ok(String::new()),
         ProcEntryType::BreenixDir => {
             // Directory listing
             Ok(String::from("testing\n"))
diff --git a/kernel/src/graphics/arm64_fb.rs b/kernel/src/graphics/arm64_fb.rs
index 8e13831e..3f26cbb4 100644
--- a/kernel/src/graphics/arm64_fb.rs
+++ b/kernel/src/graphics/arm64_fb.rs
@@ -128,7 +128,10 @@ pub fn take_dirty_rect() -> Option<(u32, u32, u32, u32)> {
 /// For VirtIO GPU, this issues transfer_to_host + resource_flush commands.
 pub fn flush_dirty_rect(x: u32, y: u32, w: u32, h: u32) -> Result<(), &'static str> {
     if is_gop_active() {
-        // GOP: writes go directly to display memory. DSB ensures visibility.
+        // GOP: pixels are in BAR0 (display scanout memory). DSB ensures the
+        // CPU's write buffer is drained so stores are visible to the display
+        // controller. Parallels scans BAR0 at its own refresh rate — no VirtIO
+        // RESOURCE_FLUSH needed (it's synchronous and would add 10-50ms).
         unsafe { core::arch::asm!("dsb sy", options(nostack, preserves_flags)); }
         Ok(())
     } else if crate::drivers::virtio::gpu_pci::is_initialized() {
@@ -180,7 +183,7 @@ pub fn is_gop_active() -> bool {
 
 /// Get the GOP framebuffer as a mutable byte slice.
 /// Returns None if GOP is not initialized.
-fn gop_framebuffer() -> Option<&'static mut [u8]> {
+pub fn gop_framebuffer() -> Option<&'static mut [u8]> {
     let ptr = GOP_FB_PTR.load(Ordering::Relaxed);
     let len = GOP_FB_LEN.load(Ordering::Relaxed);
     if ptr == 0 || len == 0 {
@@ -242,7 +245,7 @@ pub fn init_gop_framebuffer() -> Result<(), &'static str> {
     };
 
     // Initialize SHELL_FRAMEBUFFER
-    let shell_fb = ShellFrameBuffer { fb };
+    let shell_fb = ShellFrameBuffer { fb, double_buffer: None };
 
     // Cache immutable dimensions for lock-free access by sys_fbinfo
     let _ = FB_INFO_CACHE.try_init_once(|| FbInfoCache {
@@ -315,7 +318,7 @@ pub fn init_gpu_pci_gop_framebuffer() -> Result<(), &'static str> {
         is_bgr_flag: true, // B8G8R8A8_UNORM
     };
 
-    let shell_fb = ShellFrameBuffer { fb };
+    let shell_fb = ShellFrameBuffer { fb, double_buffer: None };
 
     let _ = FB_INFO_CACHE.try_init_once(|| FbInfoCache {
         width,
@@ -603,6 +606,8 @@ pub fn clear_screen(color: Color) -> Result<(), &'static str> {
 pub struct ShellFrameBuffer {
     /// The underlying framebuffer
     fb: Arm64FrameBuffer,
+    /// Double buffer: shadow buffer in cached RAM, flushed to hardware (GOP BAR0)
+    double_buffer: Option<super::double_buffer::DoubleBufferedFrameBuffer>,
 }
 
 impl ShellFrameBuffer {
@@ -610,6 +615,7 @@ impl ShellFrameBuffer {
     pub fn new() -> Option<Self> {
         Some(Self {
             fb: Arm64FrameBuffer::new()?,
+            double_buffer: None,
         })
     }
 
@@ -644,14 +650,59 @@ impl ShellFrameBuffer {
         self.fb.flush()
     }
 
-    /// Get double buffer (returns None on ARM64)
-    ///
-    /// On ARM64, the VirtIO GPU handles buffering, so we don't need
-    /// a software double buffer. This method exists for API compatibility.
-    #[allow(dead_code)]
+    /// Get mutable access to the double buffer, if available.
     pub fn double_buffer_mut(&mut self) -> Option<&mut super::double_buffer::DoubleBufferedFrameBuffer> {
-        // ARM64 VirtIO GPU handles buffering internally
-        None
+        self.double_buffer.as_mut()
+    }
+
+    /// Upgrade to double-buffered rendering.
+    ///
+    /// Allocates a shadow buffer in cached heap RAM. All pixel writes go to
+    /// the shadow buffer (fast ~1ns/write), and `flush_if_dirty()` copies
+    /// dirty regions to the hardware framebuffer (GOP BAR0). This is critical
+    /// for Parallels performance where BAR0 writes are ~100ns each.
+    ///
+    /// Must be called after heap initialization.
+    pub fn upgrade_to_double_buffer(&mut self) {
+        if self.double_buffer.is_some() {
+            return;
+        }
+
+        // Only useful for GOP mode where writes go to slow device memory
+        if !self.fb.is_gop {
+            return;
+        }
+
+        let hw_ptr = GOP_FB_PTR.load(Ordering::Relaxed);
+        let hw_len = GOP_FB_LEN.load(Ordering::Relaxed) as usize;
+        if hw_ptr == 0 || hw_len == 0 {
+            return;
+        }
+
+        let stride_bytes = self.fb.stride * self.fb.bytes_per_pixel;
+        let mut db = super::double_buffer::DoubleBufferedFrameBuffer::new(
+            hw_ptr as *mut u8,
+            hw_len,
+            stride_bytes,
+            self.fb.height,
+        );
+
+        // Copy current hardware buffer content to shadow buffer so existing
+        // screen content (split-screen layout, text) is preserved.
+        let shadow = db.buffer_mut();
+        let copy_len = shadow.len().min(hw_len);
+        // SAFETY: copy_len is clamped to both buffers and the heap shadow cannot
+        // overlap BAR0. Assumes GOP_FB_PTR/GOP_FB_LEN describe a mapped, readable
+        // region (only non-zero is checked here) — TODO confirm at the init site.
+        unsafe {
+            core::ptr::copy_nonoverlapping(hw_ptr as *const u8, shadow.as_mut_ptr(), copy_len);
+        }
+        self.double_buffer = Some(db);
+
+        crate::serial_println!(
+            "[arm64-fb] Upgraded to double buffering: {}x{} shadow buffer ({} KB)",
+            self.fb.width, self.fb.height, hw_len / 1024
+        );
     }
 }
 
@@ -677,19 +728,86 @@ impl Canvas for ShellFrameBuffer {
     }
 
     fn set_pixel(&mut self, x: i32, y: i32, color: Color) {
-        self.fb.set_pixel(x, y, color);
+        if let Some(ref mut db) = self.double_buffer {
+            // Write to shadow buffer (fast cached RAM)
+            if x < 0 || y < 0 {
+                return;
+            }
+            let x = x as usize;
+            let y = y as usize;
+            if x >= self.fb.width || y >= self.fb.height {
+                return;
+            }
+            let bpp = self.fb.bytes_per_pixel;
+            let pixel_bytes = color.to_pixel_bytes(bpp, self.fb.is_bgr_flag);
+            let offset = (y * self.fb.stride + x) * bpp;
+            let shadow = db.buffer_mut();
+            if offset + bpp <= shadow.len() {
+                shadow[offset..offset + bpp].copy_from_slice(&pixel_bytes[..bpp]);
+                // NOTE(review): this passes absolute buffer offsets, whereas
+                // mark_dirty_region() passes row-relative byte columns to
+                // mark_region_dirty_rect(); confirm which the API expects.
+                db.mark_region_dirty(y, offset, offset + bpp);
+            }
+        } else {
+            self.fb.set_pixel(x, y, color);
+        }
     }
 
     fn get_pixel(&self, x: i32, y: i32) -> Option<Color> {
-        self.fb.get_pixel(x, y)
+        if let Some(ref db) = self.double_buffer {
+            // Read from shadow buffer (fast cached RAM)
+            if x < 0 || y < 0 {
+                return None;
+            }
+            let x = x as usize;
+            let y = y as usize;
+            if x >= self.fb.width || y >= self.fb.height {
+                return None;
+            }
+            let bpp = self.fb.bytes_per_pixel;
+            let offset = (y * self.fb.stride + x) * bpp;
+            let buffer = db.buffer();
+            if offset + bpp > buffer.len() {
+                return None;
+            }
+            Some(Color::from_pixel_bytes(
+                &buffer[offset..offset + bpp],
+                bpp,
+                self.fb.is_bgr_flag,
+            ))
+        } else {
+            self.fb.get_pixel(x, y)
+        }
     }
 
     fn buffer_mut(&mut self) -> &mut [u8] {
-        self.fb.buffer_mut()
+        if let Some(ref mut db) = self.double_buffer {
+            db.buffer_mut()
+        } else {
+            self.fb.buffer_mut()
+        }
     }
 
     fn buffer(&self) -> &[u8] {
-        self.fb.buffer()
+        if let Some(ref db) = self.double_buffer {
+            db.buffer()
+        } else {
+            self.fb.buffer()
+        }
+    }
+
+    fn mark_dirty_region(&mut self, x: usize, y: usize, width: usize, height: usize) {
+        if let Some(ref mut db) = self.double_buffer {
+            let bpp = self.fb.bytes_per_pixel;
+            let stride_bytes = self.fb.stride * bpp;
+            let x_start = (x * bpp).min(stride_bytes);
+            let x_end = ((x + width) * bpp).min(stride_bytes);
+            let y_end = (y + height).min(self.fb.height);
+            db.mark_region_dirty_rect(y, y_end, x_start, x_end);
+        }
+        // Also mark the atomic dirty rect for VirtIO GPU flush hint
+        mark_dirty(x as u32, y as u32, width as u32, height as u32);
     }
 }
 
@@ -737,6 +855,17 @@ pub fn init_shell_framebuffer() -> Result<(), &'static str> {
     Ok(())
 }
 
+/// Upgrade the shell framebuffer to double-buffered rendering.
+///
+/// Allocates a shadow buffer in cached heap RAM. Must be called after
+/// heap initialization. Safe to call multiple times — only upgrades once.
+pub fn upgrade_to_double_buffer() {
+    if let Some(fb) = SHELL_FRAMEBUFFER.get() {
+        let mut guard = fb.lock();
+        guard.upgrade_to_double_buffer();
+    }
+}
+
 /// Get the framebuffer dimensions
 pub fn dimensions() -> Option<(usize, usize)> {
     SHELL_FRAMEBUFFER.get().and_then(|fb| {
diff --git a/kernel/src/graphics/font.rs b/kernel/src/graphics/font.rs
index 04d73527..fa8b9633 100644
--- a/kernel/src/graphics/font.rs
+++ b/kernel/src/graphics/font.rs
@@ -155,6 +155,11 @@ impl Glyph {
         self.rasterized.height()
     }
 
+    /// Get the raw raster data (rows of intensity bytes).
+    pub fn raster(&self) -> &[&[u8]] {
+        self.rasterized.raster()
+    }
+
     /// Iterate over the glyph pixels with coordinates and intensity.
     /// Yields (x, y, intensity) for each pixel.
     pub fn pixels(&self) -> impl Iterator + '_ {
diff --git a/kernel/src/graphics/primitives.rs b/kernel/src/graphics/primitives.rs
index 97827bba..8a665591 100644
--- a/kernel/src/graphics/primitives.rs
+++ b/kernel/src/graphics/primitives.rs
@@ -495,6 +495,17 @@ pub fn draw_char(canvas: &mut impl Canvas, x: i32, y: i32, c: char, style: &Text
 
 /// Draw a glyph at the specified position with the given style.
fn draw_glyph(canvas: &mut impl Canvas, x: i32, y: i32, glyph: &Glyph, style: &TextStyle) {
+    // Fast path: glyph fully within canvas bounds — write directly to buffer
+    if x >= 0 && y >= 0
+        && (x as usize + glyph.width()) <= canvas.width()
+        && (y as usize + glyph.height()) <= canvas.height()
+    {
+        draw_glyph_direct(canvas, x as usize, y as usize, glyph, style);
+        canvas.mark_dirty_region(x as usize, y as usize, glyph.width(), glyph.height());
+        return;
+    }
+
+    // Slow path: per-pixel with bounds checking for edge cases
     for (gx, gy, intensity) in glyph.pixels() {
         if intensity == 0 {
             continue;
@@ -504,14 +515,11 @@ fn draw_glyph(canvas: &mut impl Canvas, x: i32, y: i32, glyph: &Glyph, style: &T
         let py = y + gy as i32;
 
         let color = if let Some(bg) = style.background {
-            // Explicit background - blend foreground with specified background
             blend_colors(style.foreground, bg, intensity)
         } else {
-            // No explicit background - blend with actual canvas pixel for proper anti-aliasing
             if let Some(existing) = canvas.get_pixel(px, py) {
                 blend_colors(style.foreground, existing, intensity)
             } else {
-                // Out of bounds, skip
                 continue;
             }
         };
@@ -519,12 +527,57 @@ fn draw_glyph(canvas: &mut impl Canvas, x: i32, y: i32, glyph: &Glyph, style: &T
         canvas.set_pixel(px, py, color);
     }
 
-    // Mark the entire glyph bounding box dirty once (not per-pixel)
     if x >= 0 && y >= 0 {
         canvas.mark_dirty_region(x as usize, y as usize, glyph.width(), glyph.height());
     }
 }
 
+/// Fast-path glyph rendering: writes directly to buffer_mut() without per-pixel
+/// bounds checks or function calls. The caller must ensure the glyph is fully
+/// within canvas bounds.
+///
+/// Mirrors the per-pixel path in draw_glyph: zero-intensity pixels are left
+/// untouched, full-intensity pixels take the foreground colour, and anything
+/// in between is blended over style.background when set, otherwise over the
+/// pixel already in the buffer.
+fn draw_glyph_direct(canvas: &mut impl Canvas, x: usize, y: usize, glyph: &Glyph, style: &TextStyle) {
+    let bpp = canvas.bytes_per_pixel();
+    let is_bgr = canvas.is_bgr();
+    let stride_bytes = canvas.stride() * bpp;
+    let glyph_w = glyph.width();
+
+    // Full coverage needs no blending, so convert the foreground only once.
+    let fg_bytes = style.foreground.to_pixel_bytes(bpp, is_bgr);
+    let buffer = canvas.buffer_mut();
+
+    for (gy, row) in glyph.raster().iter().enumerate() {
+        let row_offset = (y + gy) * stride_bytes + x * bpp;
+        // Defensive: stop instead of panicking if the buffer is shorter than
+        // the canvas dimensions imply.
+        if row_offset + glyph_w * bpp > buffer.len() {
+            break;
+        }
+
+        for (gx, &intensity) in row.iter().take(glyph_w).enumerate() {
+            if intensity == 0 {
+                continue;
+            }
+            let offset = row_offset + gx * bpp;
+            let dst = &mut buffer[offset..offset + bpp];
+            if intensity == 255 {
+                dst.copy_from_slice(&fg_bytes[..bpp]);
+                continue;
+            }
+            let under = match style.background {
+                Some(bg) => bg,
+                None => Color::from_pixel_bytes(&*dst, bpp, is_bgr),
+            };
+            let px = blend_colors(style.foreground, under, intensity).to_pixel_bytes(bpp, is_bgr);
+            dst.copy_from_slice(&px[..bpp]);
+        }
+    }
+}
+
 /// Draw a text string at the specified position.
 ///
 /// Handles newlines by moving to the next line.
diff --git a/kernel/src/graphics/render_task.rs b/kernel/src/graphics/render_task.rs
index e0981ece..02008f67 100644
--- a/kernel/src/graphics/render_task.rs
+++ b/kernel/src/graphics/render_task.rs
@@ -211,16 +211,20 @@ fn flush_framebuffer() -> bool {
     }
     #[cfg(target_arch = "aarch64")]
     {
-        // Only flush if pixels have changed. The dirty rect is set by:
-        // - sys_fbdraw (syscall path, after fast pixel copies)
-        // - particles thread (after rendering)
-        // - cursor updates (above)
-        // - render_queue/split_screen text rendering
-        //
+        // First, flush the double buffer (shadow → BAR0 copy) if present.
+        // This must happen while holding SHELL_FRAMEBUFFER so the shadow
+        // buffer isn't modified mid-copy.
+        if let Some(fb) = crate::graphics::arm64_fb::SHELL_FRAMEBUFFER.get() {
+            if let Some(mut fb_guard) = fb.try_lock() {
+                if let Some(db) = fb_guard.double_buffer_mut() {
+                    db.flush_if_dirty();
+                }
+            }
+        }
+
+        // Then flush dirty regions to the display (DSB + optional VirtIO hint).
         // No SHELL_FRAMEBUFFER lock needed here — we're not touching the pixel
-        // buffer, just submitting GPU commands via gpu_mmio. This eliminates the
-        // two-lock nesting (SHELL_FRAMEBUFFER + GPU_LOCK) that caused deadlocks
-        // when sys_fbdraw held SHELL_FRAMEBUFFER with IRQs disabled.
+        // buffer, just submitting GPU commands.
         if let Some((x, y, w, h)) = crate::graphics::arm64_fb::take_dirty_rect() {
             if let Err(e) = crate::graphics::arm64_fb::flush_dirty_rect(x, y, w, h) {
                 crate::serial_println!("[render] GPU flush failed: {}", e);
diff --git a/kernel/src/graphics/split_screen.rs b/kernel/src/graphics/split_screen.rs
index 81bd7d96..cadc87f7 100644
--- a/kernel/src/graphics/split_screen.rs
+++ b/kernel/src/graphics/split_screen.rs
@@ -238,8 +238,9 @@ pub fn write_char_to_terminal(c: char) -> bool {
             if let Some(db) = fb_guard.double_buffer_mut() {
                 db.flush_if_dirty();
             }
-            #[cfg(target_arch = "aarch64")]
-            super::arm64_fb::mark_full_dirty();
+            // ARM64: dirty regions are tracked automatically via
+            // Canvas::mark_dirty_region() in the primitives layer.
+            // The render thread flushes dirty rects periodically.
             return true;
         }
 
@@ -266,8 +267,8 @@ pub fn write_str_to_terminal(s: &str) -> bool {
             if let Some(db) = fb_guard.double_buffer_mut() {
                 db.flush_if_dirty();
             }
-            #[cfg(target_arch = "aarch64")]
-            super::arm64_fb::mark_full_dirty();
+            // ARM64: dirty regions are tracked automatically via
+            // Canvas::mark_dirty_region() in the primitives layer.
             return true;
         }
 
@@ -291,8 +292,7 @@ pub fn toggle_terminal_cursor() {
             if let Some(db) = fb_guard.double_buffer_mut() {
                 db.flush_if_dirty();
             }
-            #[cfg(target_arch = "aarch64")]
-            super::arm64_fb::mark_full_dirty();
+            // ARM64: dirty regions are tracked automatically.
         }
     }
 }
diff --git a/kernel/src/main_aarch64.rs b/kernel/src/main_aarch64.rs
index 68712347..512f3835 100644
--- a/kernel/src/main_aarch64.rs
+++ b/kernel/src/main_aarch64.rs
@@ -567,6 +567,13 @@ pub extern "C" fn kernel_main(hw_config_ptr: u64) -> ! {
         false
     };
 
+    // Upgrade framebuffer to double buffering now that heap is available.
+    // This allocates a shadow buffer in cached RAM so pixel writes are fast
+    // (~1ns vs ~100ns for direct GOP BAR0 writes on Parallels).
+    if has_display {
+        kernel::graphics::arm64_fb::upgrade_to_double_buffer();
+    }
+
     // Initialize input devices (capability-based detection)
     if kernel::drivers::usb::xhci::is_initialized() {
         // USB HID keyboard/mouse via XHCI — already set up during drivers::init()
diff --git a/kernel/src/syscall/graphics.rs b/kernel/src/syscall/graphics.rs
index d8ea65e0..3e4f3f40 100644
--- a/kernel/src/syscall/graphics.rs
+++ b/kernel/src/syscall/graphics.rs
@@ -545,24 +545,63 @@ pub fn sys_fbdraw(cmd_ptr: u64) -> SyscallResult {
             (0, mmap_info.height)
         };
 
-        // Copy dirty rows from user buffer → GPU framebuffer at correct x_offset.
-        // ARM64 has no double buffer; writes go directly to VirtIO GPU memory.
         let fb_stride_bytes = fb_guard.stride() * fb_guard.bytes_per_pixel();
         let row_bytes = mmap_info.width * mmap_info.bpp;
         let x_byte_offset = mmap_info.x_offset * mmap_info.bpp;
-        let gpu_buf = fb_guard.buffer_mut();
-
-        for y in y_start..y_end {
-            let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride;
-            let gpu_row_offset = y * fb_stride_bytes + x_byte_offset;
-
-            if gpu_row_offset + row_bytes <= gpu_buf.len() {
-                unsafe {
-                    core::ptr::copy_nonoverlapping(
-                        user_row_ptr as *const u8,
-                        gpu_buf[gpu_row_offset..].as_mut_ptr(),
-                        row_bytes,
-                    );
+
+        // For GOP mode with double buffer, copy user → BAR0 directly
+        // (single copy) instead of user → shadow → BAR0 (double copy).
+        // The NC memory mapping makes BAR0 writes fast via write-combining.
+        // Also update the shadow buffer so terminal reads stay consistent.
+        if crate::graphics::arm64_fb::is_gop_active() {
+            if let Some(gop_buf) = crate::graphics::arm64_fb::gop_framebuffer() {
+                for y in y_start..y_end {
+                    let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride;
+                    let target_row_offset = y * fb_stride_bytes + x_byte_offset;
+
+                    if target_row_offset + row_bytes <= gop_buf.len() {
+                        unsafe {
+                            core::ptr::copy_nonoverlapping(
+                                user_row_ptr as *const u8,
+                                gop_buf[target_row_offset..].as_mut_ptr(),
+                                row_bytes,
+                            );
+                        }
+                    }
+                }
+                // Also update shadow buffer to keep it consistent
+                if let Some(db) = fb_guard.double_buffer_mut() {
+                    let shadow = db.buffer_mut();
+                    for y in y_start..y_end {
+                        let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride;
+                        let target_row_offset = y * fb_stride_bytes + x_byte_offset;
+                        if target_row_offset + row_bytes <= shadow.len() {
+                            unsafe {
+                                core::ptr::copy_nonoverlapping(
+                                    user_row_ptr as *const u8,
+                                    shadow[target_row_offset..].as_mut_ptr(),
+                                    row_bytes,
+                                );
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            // Non-GOP path: copy to GPU buffer (VirtIO MMIO/PCI framebuffer)
+            let target_buf = fb_guard.buffer_mut();
+            for y in y_start..y_end {
+                let user_row_ptr = (mmap_info.user_addr as usize) + y * mmap_info.user_stride;
+                let target_row_offset = y * fb_stride_bytes + x_byte_offset;
+
+                if target_row_offset + row_bytes <= target_buf.len() {
+                    unsafe {
+                        core::ptr::copy_nonoverlapping(
+                            user_row_ptr as *const u8,
+                            target_buf[target_row_offset..].as_mut_ptr(),
+                            row_bytes,
+                        );
+                    }
                 }
             }
         }
@@ -574,11 +613,10 @@ pub fn sys_fbdraw(cmd_ptr: u64) -> SyscallResult {
         // for terminal text while we submit GPU commands.
         drop(fb_guard);
 
-        // Synchronous GPU flush — submit transfer_to_host + resource_flush
-        // directly in the syscall instead of deferring to the render thread.
-        // This eliminates scheduling latency: bounce's frame is displayed
-        // immediately rather than waiting for the render thread to wake up
-        // (which could take 5ms+ due to timer tick granularity).
+        // Synchronous flush — call flush_dirty_rect directly in the syscall
+        // (GOP: a DSB barrier only; VirtIO: transfer_to_host + resource_flush).
+        // This eliminates scheduling latency: bounce's frame is displayed
+        // immediately rather than waiting for the render thread (5ms+ due to timer tick).
         if let Some((fx, fy, fw, fh)) = flush_rect {
             let _ = crate::graphics::arm64_fb::flush_dirty_rect(fx, fy, fw, fh);
         }
diff --git a/parallels-loader/src/page_tables.rs b/parallels-loader/src/page_tables.rs
index 5e288bb1..8ba87522 100644
--- a/parallels-loader/src/page_tables.rs
+++ b/parallels-loader/src/page_tables.rs
@@ -177,13 +177,23 @@ pub fn build_page_tables(storage: &mut PageTableStorage) -> (u64, u64) {
     // TTBR1 L1[0] -> L2 (HHDM + 0x00000000 - 0x3FFFFFFF)
     write_entry(ttbr1_l1, 0, ttbr1_l2_dev | attr::TABLE_DESC);
 
-    // Map all 2MB blocks in 0x00000000-0x3FFFFFFF as device memory
+    // Map all 2MB blocks in 0x00000000-0x3FFFFFFF as device memory,
+    // except for the GOP BAR0 framebuffer region (0x10000000-0x10FFFFFF,
+    // L2 indices 128-135) which uses Normal-NC for write-combining.
+    // Write-combining allows the CPU to coalesce/reorder stores, making
+    // bulk memcpy from shadow buffer to BAR0 significantly faster while
+    // maintaining coherency with the hypervisor's display scanout.
     // This covers: GIC (0x02010000), UART (0x02110000), PCI ECAM (0x02300000),
     // GICR (0x02500000), PCI MMIO (0x10000000-0x1FFFFFFF)
     for i in 0..512u64 {
         let phys = i * 0x20_0000; // 2MB blocks
-        write_entry(ttbr0_l2_dev, i as usize, phys | attr::DEVICE_BLOCK);
-        write_entry(ttbr1_l2_dev, i as usize, phys | attr::DEVICE_BLOCK);
+        let block_attr = if (128..136).contains(&i) {
+            attr::NC_BLOCK // GOP BAR0: write-combining for framebuffer
+        } else {
+            attr::DEVICE_BLOCK
+        };
+        write_entry(ttbr0_l2_dev, i as usize, phys | block_attr);
+        write_entry(ttbr1_l2_dev, i as usize, phys | block_attr);
     }
 
     // --- RAM: 0x40000000 - 0xBFFFFFFF (2GB, L1 entries 1-2) ---