diff --git a/Cargo.lock b/Cargo.lock index c7528121ff2..61f68437b7a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,6 +47,38 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "argh" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ff18325c8a36b82f992e533ece1ec9f9a9db446bd1c14d4f936bac88fcd240" +dependencies = [ + "argh_derive", + "argh_shared", + "rust-fuzzy-search", +] + +[[package]] +name = "argh_derive" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb7b2b83a50d329d5d8ccc620f5c7064028828538bdf5646acd60dc1f767803" +dependencies = [ + "argh_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "argh_shared" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a464143cc82dedcdc3928737445362466b7674b5db4e2eb8e869846d6d84f4f6" +dependencies = [ + "serde", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -355,6 +387,16 @@ dependencies = [ "stdext", ] +[[package]] +name = "lsh-bin" +version = "0.0.0" +dependencies = [ + "anyhow", + "argh", + "lsh", + "stdext", +] + [[package]] name = "memchr" version = "2.7.6" @@ -514,6 +556,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "rust-fuzzy-search" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a157657054ffe556d8858504af8a672a054a6e0bd9e8ee531059100c0fa11bb2" + [[package]] name = "rustversion" version = "1.0.22" diff --git a/crates/edit/benches/lib.rs b/crates/edit/benches/lib.rs index 2141e9029ef..af1639c27b2 100644 --- a/crates/edit/benches/lib.rs +++ b/crates/edit/benches/lib.rs @@ -7,9 +7,10 @@ use std::{mem, vec}; use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; use edit::helpers::*; -use edit::{buffer, glob, hash, json, oklab, simd, unicode}; +use edit::{buffer, hash, json, oklab, simd, unicode}; use stdext::arena::{self, scratch_arena}; use stdext::collections::BVec; +use stdext::glob; use stdext::unicode::Utf8Chars; struct EditingTracePatch<'a>(usize, usize, &'a str); diff --git a/crates/edit/src/lib.rs b/crates/edit/src/lib.rs index 6bb731cfd5a..72cddd83343 100644 --- a/crates/edit/src/lib.rs +++ b/crates/edit/src/lib.rs @@ -15,7 +15,6 @@ pub mod clipboard; pub mod document; pub mod framebuffer; pub mod fuzzy; -pub mod glob; pub mod hash; pub mod helpers; pub mod icu; diff --git a/crates/lsh-bin/Cargo.toml b/crates/lsh-bin/Cargo.toml new file mode 100644 index 00000000000..8601fe1960e --- /dev/null +++ b/crates/lsh-bin/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "lsh-bin" +version = "0.0.0" + +edition.workspace = true +license.workspace = true +repository.workspace = true +rust-version.workspace = true + +[dependencies] +anyhow = "*" +argh = "*" +lsh.workspace = true +stdext.workspace = true diff --git a/crates/lsh-bin/src/main.rs b/crates/lsh-bin/src/main.rs new file mode 100644 index 00000000000..6e6a4ae7faf --- /dev/null +++ b/crates/lsh-bin/src/main.rs @@ -0,0 +1,188 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +use std::fs::File; +use std::io::{BufRead, BufReader, BufWriter, IsTerminal, Write as _, stdout}; +use std::path::{Path, PathBuf}; +use std::process::exit; + +use anyhow::bail; +use argh::FromArgs; +use lsh::compiler::SerializedCharset; +use lsh::runtime::Runtime; +use stdext::arena::scratch_arena; +use stdext::glob::glob_match; + +#[derive(FromArgs, PartialEq, Debug)] +#[argh(description = "Debug and test frontend for Leonard's Shitty Highlighter")] +struct Command { + #[argh(subcommand)] + sub: SubCommands, +} + +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand)] +enum SubCommands { + Compile(SubCommandOneCompile), + Assembly(SubCommandAssembly), + Render(SubCommandRender), +} + +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "compile", description = "Generate Rust code from .lsh files")] +struct SubCommandOneCompile { + #[argh(positional, description = "source .lsh file or directory")] + lsh: PathBuf, +} + +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "assembly", description = "Generate assembly from .lsh files")] +struct SubCommandAssembly { + #[argh(positional, description = "source .lsh file or directory")] + lsh: PathBuf, +} + +#[derive(FromArgs, PartialEq, Debug)] +#[argh(subcommand, name = "render", description = "Highlight text files")] +struct SubCommandRender { + #[argh(positional, description = "source .lsh file or directory")] + lsh: PathBuf, + #[argh(positional, description = "source text file")] + input: PathBuf, +} + +pub fn main() { + if let Err(e) = run() { + eprintln!("{e}"); + exit(1); + } +} + +fn run() -> anyhow::Result<()> { + stdext::arena::init(128 * 1024 * 1024).unwrap(); + + let command: Command = argh::from_env(); + let scratch = scratch_arena(None); + let mut generator = lsh::compiler::Generator::new(&scratch); + let mut read_lsh = |path: &Path| { + if path.is_dir() { generator.read_directory(path) } else { generator.read_file(path) } + }; + + match &command.sub { + SubCommands::Compile(cmd) => { + read_lsh(&cmd.lsh)?; + let output = generator.generate_rust()?; + _ = stdout().write_all(output.as_bytes()); + } + SubCommands::Assembly(cmd) => { + read_lsh(&cmd.lsh)?; + let vt = stdout().is_terminal(); + let output = generator.generate_assembly(vt)?; + _ = stdout().write_all(output.as_bytes()); + } + SubCommands::Render(cmd) => { + read_lsh(&cmd.lsh)?; + run_render(generator, &cmd.input)?; + } + } + + Ok(()) +} + +fn run_render(generator: lsh::compiler::Generator, path: &Path) -> anyhow::Result<()> { + let assembly = generator.assemble()?; + + let Some(entrypoint) = assembly.entrypoints.iter().find(|ep| { + ep.paths + .iter() + .any(|pattern| glob_match(pattern.as_bytes(), path.as_os_str().as_encoded_bytes())) + }) else { + bail!("No matching highlighting definition found"); + }; + + let mut color_map = Vec::new(); + let mut unknown_kinds = Vec::new(); + for hk in &assembly.highlight_kinds { + let color = match hk.identifier { + "other" => "", + + "comment" => "\x1b[32m", // Green + "method" => "\x1b[93m", // Bright Yellow + "string" => "\x1b[91m", // Bright Red + "variable" => "\x1b[96m", // Bright Cyan + + "constant.language" => "\x1b[94m", // Bright Blue + "constant.numeric" => "\x1b[92m", // Bright Green + "keyword.control" => "\x1b[95m", // Bright Magenta + "keyword.other" => "\x1b[94m", // Bright Blue + "markup.bold" => "\x1b[1m", // Bold + "markup.changed" => "\x1b[94m", // Bright Blue + "markup.deleted" => "\x1b[91m", // Bright Red + "markup.heading" => "\x1b[94m", // Bright Blue + "markup.inserted" => "\x1b[92m", // Bright Green + "markup.italic" => "\x1b[3m", // Italic + "markup.link" => "\x1b[4m", // Underlined + "markup.list" => "\x1b[94m", // Bright Blue + "markup.strikethrough" => "\x1b[9m", // Strikethrough + "meta.header" => "\x1b[94m", // Bright Blue + + _ => { + unknown_kinds.push(hk.identifier.to_string()); + "" + } + }; + + if !color.is_empty() { + if color_map.len() <= hk.value as usize { + color_map.resize(hk.value as usize + 1, ""); + } + color_map[hk.value as usize] = color; + } + } + if !unknown_kinds.is_empty() { + eprintln!("\x1b[33mWarning: Unknown highlight kinds:"); + for kind in &unknown_kinds { + eprintln!(" - {}", kind); + } + eprintln!("\x1b[m"); + } + + // Convert Assembly data to static references by leaking memory + // This is fine for a CLI tool that runs once and exits + let charsets: Vec = + assembly.charsets.into_iter().map(|cs| cs.serialize()).collect(); + + let mut runtime = Runtime::new( + &assembly.instructions, + &assembly.strings, + &charsets, + entrypoint.address as u32, + ); + + let reader = BufReader::with_capacity(128 * 1024, File::open(path)?); + let mut stdout = BufWriter::with_capacity(128 * 1024, stdout()); + + for line in reader.lines() { + let line = line?; + let scratch = scratch_arena(None); + let highlights = runtime.parse_next_line::(&scratch, line.as_bytes()); + + for w in highlights.windows(2) { + let curr = &w[0]; + let next = &w[1]; + let start = curr.start; + let end = next.start; + let kind = curr.kind; + let text = &line[start..end]; + + if let Some(color) = color_map.get(kind as usize) { + write!(stdout, "{color}{text}\x1b[m")?; + } else { + stdout.write_all(text.as_bytes())?; + } + } + writeln!(stdout)?; + } + + Ok(()) +} diff --git a/crates/lsh/src/compiler/backend.rs b/crates/lsh/src/compiler/backend.rs index a43ee40585b..cbe6f84c51c 100644 --- a/crates/lsh/src/compiler/backend.rs +++ b/crates/lsh/src/compiler/backend.rs @@ -522,6 +522,7 @@ impl<'a> LivenessAnalysis<'a> { let ir = cell.borrow(); + #[allow(clippy::collapsible_match)] match ir.instr { IRI::Mov { dst, src } => { if dst.borrow().physical.is_none() { diff --git a/crates/lsh/src/runtime.rs b/crates/lsh/src/runtime.rs index 3bfd1f07c78..c3af7aac4a4 100644 --- a/crates/lsh/src/runtime.rs +++ b/crates/lsh/src/runtime.rs @@ -22,7 +22,8 @@ //! - [`Instruction::address_offset`] returns where, within an instruction, the jump target lives, //! as used by the backend's relocation system. -use std::fmt; +use std::fmt::{self, Debug}; +use std::mem; use stdext::arena::Arena; use stdext::arena_write_fmt; @@ -55,12 +56,303 @@ pub struct Highlight { pub kind: T, } -impl fmt::Debug for Highlight { +impl Debug for Highlight { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "({}, {:?})", self.start, self.kind) } } +/// The bytecode interpreter for syntax highlighting. +#[derive(Clone)] +pub struct Runtime<'pa, 'ps, 'pc> { + assembly: &'pa [u8], + strings: &'ps [&'ps str], + charsets: &'pc [[u16; 16]], + entrypoint: u32, + stack: Vec, + registers: Registers, +} + +/// Snapshot of the runtime state for incremental re-highlighting. +#[derive(Clone)] +pub struct RuntimeState { + stack: Vec, + registers: Registers, +} + +impl<'pa, 'ps, 'pc> Runtime<'pa, 'ps, 'pc> { + pub fn new( + assembly: &'pa [u8], + strings: &'ps [&'ps str], + charsets: &'pc [[u16; 16]], + entrypoint: u32, + ) -> Self { + Runtime { + assembly, + strings, + charsets, + entrypoint, + stack: Default::default(), + registers: Registers { pc: entrypoint, ..Default::default() }, + } + } + + pub fn snapshot(&self) -> RuntimeState { + RuntimeState { stack: self.stack.clone(), registers: self.registers } + } + + pub fn restore(&mut self, state: &RuntimeState) { + self.stack = state.stack.clone(); + self.registers = state.registers; + } + + /// Parse a single line and return highlight spans. + /// + /// Executes bytecode until the line is fully consumed or a `Return` resets the VM. + /// The returned spans partition the line into highlighted regions. + /// + /// # Returns + /// A vector of [`Highlight`] spans. Always contains at least two spans: + /// one at offset 0 and one at `line.len()` as a sentinel. + pub fn parse_next_line<'a, T: PartialEq + TryFrom>( + &mut self, + arena: &'a Arena, + line: &[u8], + ) -> BVec<'a, Highlight> { + let mut res: BVec<'a, Highlight> = BVec::empty(); + + self.registers.off = 0; + self.registers.hs = 0; + + // By default, any line starts with HighlightKind::Other. + // If the DSL yields anything, this will be overwritten. + res.push(arena, Highlight { start: 0, kind: unsafe { mem::zeroed() } }); + + loop { + instruction_decode!(self.assembly, self.registers.pc, { + Mov { dst, src } => { + let s = self.registers.get(src); + self.registers.set(dst, s); + } + Add { dst, src } => { + let d = self.registers.get(dst); + let s = self.registers.get(src); + self.registers.set(dst, d.saturating_add(s)); + } + Sub { dst, src } => { + let d = self.registers.get(dst); + let s = self.registers.get(src); + self.registers.set(dst, d.saturating_sub(s)); + } + MovImm { dst, imm } => { + self.registers.set(dst, imm); + } + AddImm { dst, imm } => { + let d = self.registers.get(dst); + self.registers.set(dst, d.saturating_add(imm)); + } + SubImm { dst, imm } => { + let d = self.registers.get(dst); + self.registers.set(dst, d.saturating_sub(imm)); + } + + Call { tgt } => { + // PC already points to the next instruction (= return address) + self.registers.save_registers(&mut self.stack); + self.registers.pc = tgt; + } + Return => { + if !self.registers.load_registers(&mut self.stack) { + self.registers = Registers { pc: self.entrypoint, ..Default::default() }; + break; + } + } + + JumpEQ { lhs, rhs, tgt } => { + if self.registers.get(lhs) == self.registers.get(rhs) { + self.registers.pc = tgt; + } + } + JumpNE { lhs, rhs, tgt } => { + if self.registers.get(lhs) != self.registers.get(rhs) { + self.registers.pc = tgt; + } + } + JumpLT { lhs, rhs, tgt } => { + if self.registers.get(lhs) < self.registers.get(rhs) { + self.registers.pc = tgt; + } + } + JumpLE { lhs, rhs, tgt } => { + if self.registers.get(lhs) <= self.registers.get(rhs) { + self.registers.pc = tgt; + } + } + JumpGT { lhs, rhs, tgt } => { + if self.registers.get(lhs) > self.registers.get(rhs) { + self.registers.pc = tgt; + } + } + JumpGE { lhs, rhs, tgt } => { + if self.registers.get(lhs) >= self.registers.get(rhs) { + self.registers.pc = tgt; + } + } + + JumpIfEndOfLine { tgt } => { + if (self.registers.off as usize) >= line.len() { + self.registers.pc = tgt; + } + } + + JumpIfMatchCharset { idx, min, max, tgt } => { + let off = self.registers.off as usize; + let cs = &self.charsets[idx as usize]; + let min = min as usize; + let max = max as usize; + + if let Some(off) = Self::charset_gobble(line, off, cs, min, max) { + self.registers.off = off as u32; + self.registers.pc = tgt; + } + } + JumpIfMatchPrefix { idx, tgt } => { + let off = self.registers.off as usize; + let str = self.strings[idx as usize].as_bytes(); + + if Self::inlined_memcmp(line, off, str) { + self.registers.off = (off + str.len()) as u32; + self.registers.pc = tgt; + } + } + JumpIfMatchPrefixInsensitive { idx, tgt } => { + let off = self.registers.off as usize; + let str = self.strings[idx as usize].as_bytes(); + + if Self::inlined_memicmp(line, off, str) { + self.registers.off = (off + str.len()) as u32; + self.registers.pc = tgt; + } + } + + FlushHighlight { kind } => { + let kind = self.registers.get(kind); + let kind = unsafe { kind.try_into().unwrap_unchecked() }; + let start = (self.registers.hs as usize).min(line.len()); + + if let Some(last) = res.last_mut() + && (last.start == start || last.kind == kind) + { + last.kind = kind; + } else { + res.push(arena, Highlight { start, kind }); + } + + self.registers.hs = self.registers.off; + } + AwaitInput => { + let off = self.registers.off as usize; + if off >= line.len() { + break; + } + } + + _ => unreachable!(), + }); + } + + // Ensure that there's a past-the-end highlight. + if res.last().is_none_or(|last| last.start < line.len()) { + res.push(arena, Highlight { start: line.len(), kind: unsafe { mem::zeroed() } }); + } + + res + } + + // TODO: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html#alternative-implementation + #[inline] + fn charset_gobble( + haystack: &[u8], + off: usize, + cs: &[u16; 16], + min: usize, + max: usize, + ) -> Option { + let mut i = 0usize; + while i < max { + let idx = off + i; + if idx >= haystack.len() || !Self::in_set(cs, haystack[idx]) { + break; + } + i += 1; + } + if i >= min { Some(off + i) } else { None } + } + + /// A mini-memcmp implementation for short needles. + /// Compares the `haystack` at `off` with the `needle`. + #[inline] + fn inlined_memcmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool { + unsafe { + if off >= haystack.len() || haystack.len() - off < needle.len() { + return false; + } + + let a = haystack.as_ptr().add(off); + let b = needle.as_ptr(); + let mut i = 0; + + while i < needle.len() { + let a = *a.add(i); + let b = *b.add(i); + i += 1; + if a != b { + return false; + } + } + + true + } + } + + /// Like `inlined_memcmp`, but case-insensitive. + #[inline] + fn inlined_memicmp(haystack: &[u8], off: usize, needle: &[u8]) -> bool { + unsafe { + if off >= haystack.len() || haystack.len() - off < needle.len() { + return false; + } + + let a = haystack.as_ptr().add(off); + let b = needle.as_ptr(); + let mut i = 0; + + while i < needle.len() { + // str in PrefixInsensitive(str) is expected to be lowercase, printable ASCII. + let a = a.add(i).read().to_ascii_lowercase(); + let b = b.add(i).read(); + i += 1; + if a != b { + return false; + } + } + + true + } + } + + #[inline] + fn in_set(bitmap: &[u16; 16], byte: u8) -> bool { + let lo_nibble = byte & 0xf; + let hi_nibble = byte >> 4; + + let bitset = bitmap[lo_nibble as usize]; + let bitmask = 1u16 << hi_nibble; + + (bitset & bitmask) != 0 + } +} + #[repr(u8)] #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum Register { @@ -154,6 +446,26 @@ impl Registers { unsafe { self.as_mut_ptr().add(reg as usize).write(val) } } + #[inline(always)] + fn save_registers(&self, vec: &mut Vec) { + unsafe { vec.extend_from_slice(std::slice::from_raw_parts(self.as_ptr().add(2), 14)) }; + } + + #[inline(always)] + fn load_registers(&mut self, vec: &mut Vec) -> bool { + unsafe { + if vec.len() < 14 { + return false; + } + + let src = vec.as_ptr().add(vec.len() - 14); + let dst = self.as_mut_ptr().add(2); + std::ptr::copy_nonoverlapping(src, dst, 14); + vec.truncate(vec.len() - 14); + true + } + } + #[inline(always)] unsafe fn as_ptr(&self) -> *const u32 { self as *const _ as *const u32 @@ -403,6 +715,8 @@ macro_rules! instruction_decode { }}; } +use instruction_decode; + impl Instruction { // JumpIfMatchCharset, etc., are 1 byte opcode + 4 u32 parameters. pub const MAX_ENCODED_SIZE: usize = 1 + 4 * 4; diff --git a/crates/edit/src/glob.rs b/crates/stdext/src/glob.rs similarity index 100% rename from crates/edit/src/glob.rs rename to crates/stdext/src/glob.rs diff --git a/crates/stdext/src/lib.rs b/crates/stdext/src/lib.rs index 30b59d54903..e009494acf9 100644 --- a/crates/stdext/src/lib.rs +++ b/crates/stdext/src/lib.rs @@ -6,6 +6,7 @@ pub mod alloc; pub mod arena; pub mod collections; +pub mod glob; mod helpers; pub mod simd; pub mod sys;