diff --git a/NEWS.md b/NEWS.md index 693b46f0..06cd8638 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,45 @@ program: args_list(f_opt(number), opt_tail(string), number) https://github.com/ruby/lrama/pull/779 +### [EXPERIMENTAL] Support the generation of the PSLR(1) parser described in the dissertation below + +Added experimental support for generating the PSLR(1) parser described in the following dissertation: +https://open.clemson.edu/all_dissertations/519/ + +This adds the following PSLR-related grammar directives and integration points: + +- `%define lr.type pslr` enables PSLR parser generation +- `%token-pattern` declares token candidates and their regular expressions for PSLR-aware lexical disambiguation +- `%lex-prec` declares how overlapping token patterns should be prioritized +- `%define api.pslr.state-member` names the parser-state field to be shared with the lexer when using the generated helper macros + +Typical usage looks like this: + +```yacc +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%parse-param {struct parse_params *p} +%lex-param {struct parse_params *p} + +%token-pattern RSHIFT />>/ "right shift" +%token-pattern RANGLE />/ "right angle" +%token-pattern ID /[a-z]+/ + +%lex-prec RANGLE -s RSHIFT +``` + +In this setup, `%token-pattern` lists the tokens that the PSLR scanner should consider, and `%lex-prec` +resolves conflicts between overlapping matches. For example, `%lex-prec RANGLE -s RSHIFT` tells Lrama to +prefer `RANGLE` over `RSHIFT` when the shorter token should win. + +When the parser and lexer share a context through `%parse-param` / `%lex-param`, the generated header also +provides helpers such as `YYPSLR_PSEUDO_SCAN(...)`, so the lexer can choose a token based on the current parser +state. + +Note that the PSLR(1) parser is currently an experimental feature. If you find any bugs, please report them to us. Thank you.
+ ## Lrama 0.7.1 (2025-12-24) ### Optimize IELR diff --git a/lib/lrama.rb b/lib/lrama.rb index 56ba0044..c676b32d 100644 --- a/lib/lrama.rb +++ b/lib/lrama.rb @@ -15,8 +15,10 @@ require_relative "lrama/output" require_relative "lrama/parser" require_relative "lrama/reporter" +require_relative "lrama/scanner_fsa" require_relative "lrama/state" require_relative "lrama/states" +require_relative "lrama/length_precedences" require_relative "lrama/tracer" require_relative "lrama/version" require_relative "lrama/warnings" diff --git a/lib/lrama/command.rb b/lib/lrama/command.rb index 17aad1a1..236dca05 100644 --- a/lib/lrama/command.rb +++ b/lib/lrama/command.rb @@ -30,11 +30,11 @@ def execute_command_workflow text = read_input grammar = build_grammar(text) states, context = compute_status(grammar) + states.validate!(@logger) render_reports(states) if @options.report_file @tracer.trace(grammar) render_diagram(grammar) render_output(context, grammar) - states.validate!(@logger) @warnings.warn(grammar, states) end @@ -84,7 +84,11 @@ def prepare_grammar(grammar) def compute_status(grammar) states = Lrama::States.new(grammar, @tracer) states.compute - states.compute_ielr if grammar.ielr_defined? + if grammar.pslr_defined? + states.compute_pslr + elsif grammar.ielr_defined? 
+ states.compute_ielr + end [states, Lrama::Context.new(states)] end diff --git a/lib/lrama/grammar.rb b/lib/lrama/grammar.rb index 95a80bb0..edd4ecdf 100644 --- a/lib/lrama/grammar.rb +++ b/lib/lrama/grammar.rb @@ -20,6 +20,8 @@ require_relative "grammar/symbols" require_relative "grammar/type" require_relative "grammar/union" +require_relative "grammar/token_pattern" +require_relative "grammar/lex_prec" require_relative "lexer" module Lrama @@ -40,6 +42,11 @@ class Grammar # def nterms: () -> Array[Grammar::Symbol] # def find_symbol_by_s_value!: (::String s_value) -> Grammar::Symbol # def ielr_defined?: () -> bool + # def pslr_defined?: () -> bool + # def token_patterns: () -> Array[Grammar::TokenPattern] + # def lex_prec: () -> Grammar::LexPrec + # def pslr_max_states: () -> Integer? + # def pslr_max_state_ratio: () -> Float? # end # # include Symbols::Resolver::_DelegatedMethods @@ -68,6 +75,8 @@ class Grammar # @union: Union # @precedences: Array[Precedence] # @start_nterm: Lrama::Lexer::Token::Base? + # @token_patterns: Array[Grammar::TokenPattern] + # @lex_prec: Grammar::LexPrec extend Forwardable @@ -100,6 +109,8 @@ class Grammar attr_accessor :locations #: bool attr_accessor :define #: Hash[String, String] attr_accessor :required #: bool + attr_reader :token_patterns #: Array[Grammar::TokenPattern] + attr_reader :lex_prec #: Grammar::LexPrec def_delegators "@symbols_resolver", :symbols, :nterms, :terms, :add_nterm, :add_term, :find_term_by_s_value, :find_symbol_by_number!, :find_symbol_by_id!, :token_to_symbol, @@ -133,6 +144,9 @@ def initialize(rule_counter, locations, define = {}) @required = false @precedences = [] @start_nterm = nil + @token_patterns = [] + @lex_prec = Grammar::LexPrec.new + @token_pattern_counter = 0 append_special_symbols end @@ -277,6 +291,7 @@ def validate! validate_no_precedence_for_nterm! validate_rule_lhs_is_nterm! validate_duplicated_precedence! + validate_pslr_configuration! 
end # @rbs (Grammar::Symbol sym) -> Array[Rule] @@ -304,8 +319,104 @@ def ielr_defined? @define.key?('lr.type') && @define['lr.type'] == 'ielr' end + # @rbs () -> bool + def pslr_defined? + @define.key?('lr.type') && @define['lr.type'] == 'pslr' + end + + # @rbs () -> String? + def pslr_state_member + @define['api.pslr.state-member'] + end + + # @rbs () -> Integer? + def pslr_max_states + parse_pslr_positive_integer('pslr.max-states') + end + + # @rbs () -> Float? + def pslr_max_state_ratio + parse_pslr_positive_float('pslr.max-state-ratio') + end + + # Add a token pattern from %token-pattern directive + # @rbs (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, ?alias_name: String?, ?tag: Lexer::Token::Tag?, lineno: Integer) -> Grammar::TokenPattern + def add_token_pattern(id:, pattern:, alias_name: nil, tag: nil, lineno:) + token_pattern = Grammar::TokenPattern.new( + id: id, + pattern: pattern, + alias_name: alias_name, + tag: tag, + lineno: lineno, + definition_order: @token_pattern_counter + ) + @token_pattern_counter += 1 + @token_patterns << token_pattern + + # Also register as a terminal symbol + add_term(id: id, alias_name: alias_name, tag: tag) + + token_pattern + end + + # Add a lex-prec rule from %lex-prec directive + # @rbs (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> Grammar::LexPrec::Rule + def add_lex_prec_rule(left_token:, operator:, right_token:, lineno:) + @lex_prec.add_rule( + left_token: left_token, + operator: operator, + right_token: right_token, + lineno: lineno + ) + end + + # Find a token pattern by its name + # @rbs (String name) -> Grammar::TokenPattern? + def find_token_pattern(name) + @token_patterns.find { |tp| tp.name == name } + end + private + # @rbs () -> void + def validate_pslr_configuration! + return unless pslr_defined? 
+ + member = pslr_state_member + if member && member !~ /\A[a-zA-Z_][a-zA-Z0-9_]*\z/ + raise %(%define api.pslr.state-member must be a valid C identifier, got "#{member}".) + end + + pslr_max_states + pslr_max_state_ratio + end + + # @rbs (String key) -> Integer? + def parse_pslr_positive_integer(key) + value = @define[key] + return nil if value.nil? || value.empty? + + parsed = Integer(value, 10) + raise %(%define #{key} must be greater than 0, got "#{value}".) unless 0 < parsed + + parsed + rescue ArgumentError + raise %(%define #{key} must be an integer, got "#{value}".) + end + + # @rbs (String key) -> Float? + def parse_pslr_positive_float(key) + value = @define[key] + return nil if value.nil? || value.empty? + + parsed = Float(value) + raise %(%define #{key} must be greater than or equal to 1.0, got "#{value}".) unless 1.0 <= parsed + + parsed + rescue ArgumentError + raise %(%define #{key} must be a number, got "#{value}".) + end + # @rbs () -> void def sort_precedence @precedences.sort_by! 
do |prec| diff --git a/lib/lrama/grammar/lex_prec.rb b/lib/lrama/grammar/lex_prec.rb new file mode 100644 index 00000000..c5d30da9 --- /dev/null +++ b/lib/lrama/grammar/lex_prec.rb @@ -0,0 +1,98 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class Grammar + # Represents lexical precedence rules defined by %lex-prec directive + # Based on Definition 3.2.3, 3.2.4, 3.2.10 from the PSLR dissertation + # + # Example: %lex-prec RANGLE -s RSHIFT # RANGLE is shorter than RSHIFT + # %lex-prec IF - ID # IF has higher priority than ID (same length) + class LexPrec + # Precedence relation types + # "," : Same priority (lex-tie) + # "-" : Left has higher priority than right + # "-s" : Left is shorter match priority over right + SAME_PRIORITY = :same #: Symbol + HIGHER = :higher #: Symbol + SHORTER = :shorter #: Symbol + + # Represents a single precedence rule + class Rule + attr_reader :left_token #: Lexer::Token::Ident + attr_reader :operator #: Symbol + attr_reader :right_token #: Lexer::Token::Ident + attr_reader :lineno #: Integer + + # @rbs (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> void + def initialize(left_token:, operator:, right_token:, lineno:) + @left_token = left_token + @operator = operator + @right_token = right_token + @lineno = lineno + end + + # @rbs () -> String + def left_name + @left_token.s_value + end + + # @rbs () -> String + def right_name + @right_token.s_value + end + end + + attr_reader :rules #: Array[Rule] + + # @rbs () -> void + def initialize + @rules = [] + end + + # @rbs (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> Rule + def add_rule(left_token:, operator:, right_token:, lineno:) + rule = Rule.new( + left_token: left_token, + operator: operator, + right_token: right_token, + lineno: lineno + ) + @rules << rule + rule + end + + # Check if token t1 has higher priority than t2 + # Based on 
Definition 3.2.4 + # @rbs (String t1, String t2) -> bool + def higher_priority?(t1, t2) + @rules.any? do |rule| + rule.operator == HIGHER && + rule.left_name == t1 && + rule.right_name == t2 + end + end + + # Check if token t1 has shorter-match priority over t2 + # Based on Definition 3.2.15 + # @rbs (String t1, String t2) -> bool + def shorter_priority?(t1, t2) + @rules.any? do |rule| + rule.operator == SHORTER && + rule.left_name == t1 && + rule.right_name == t2 + end + end + + # Check if tokens t1 and t2 are in a lex-tie relationship + # @rbs (String t1, String t2) -> bool + def same_priority?(t1, t2) + @rules.any? do |rule| + rule.operator == SAME_PRIORITY && + ((rule.left_name == t1 && rule.right_name == t2) || + (rule.left_name == t2 && rule.right_name == t1)) + end + end + end + end +end diff --git a/lib/lrama/grammar/symbols/resolver.rb b/lib/lrama/grammar/symbols/resolver.rb index 085a835d..72ab17a1 100644 --- a/lib/lrama/grammar/symbols/resolver.rb +++ b/lib/lrama/grammar/symbols/resolver.rb @@ -52,15 +52,17 @@ def sort_by_number! def add_term(id:, alias_name: nil, tag: nil, token_id: nil, replace: false) if token_id && (sym = find_symbol_by_token_id(token_id)) if replace - sym.id = id - sym.alias_name = alias_name - sym.tag = tag + replace_term_attributes(sym, id: id, alias_name: alias_name, tag: tag, token_id: token_id) end return sym end if (sym = find_symbol_by_id(id)) + if replace + replace_term_attributes(sym, id: id, alias_name: alias_name, tag: tag, token_id: token_id) + end + return sym end @@ -229,6 +231,14 @@ def find_nterm_by_id!(id) end || (raise "Symbol not found. #{id}") end + # @rbs (Grammar::Symbol sym, id: Lexer::Token::Base, ?alias_name: String?, ?tag: Lexer::Token::Tag?, ?token_id: Integer?) 
-> void + def replace_term_attributes(sym, id:, alias_name: nil, tag: nil, token_id: nil) + sym.id = id + sym.alias_name = alias_name + sym.tag = tag + sym.token_id = token_id if token_id + end + # @rbs () -> void def fill_terms_number # Character literal in grammar file has diff --git a/lib/lrama/grammar/token_pattern.rb b/lib/lrama/grammar/token_pattern.rb new file mode 100644 index 00000000..92e8374e --- /dev/null +++ b/lib/lrama/grammar/token_pattern.rb @@ -0,0 +1,38 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class Grammar + # Represents a token pattern defined by %token-pattern directive + # Example: %token-pattern RSHIFT />>/ "right shift" + class TokenPattern + attr_reader :id #: Lexer::Token::Ident + attr_reader :pattern #: Lexer::Token::Regex + attr_reader :alias_name #: String? + attr_reader :tag #: Lexer::Token::Tag? + attr_reader :lineno #: Integer + attr_reader :definition_order #: Integer + + # @rbs (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, ?alias_name: String?, ?tag: Lexer::Token::Tag?, lineno: Integer, definition_order: Integer) -> void + def initialize(id:, pattern:, alias_name: nil, tag: nil, lineno:, definition_order:) + @id = id + @pattern = pattern + @alias_name = alias_name + @tag = tag + @lineno = lineno + @definition_order = definition_order + end + + # @rbs () -> String + def name + @id.s_value + end + + # Returns the regex pattern string (without slashes) + # @rbs () -> String + def regex_pattern + @pattern.pattern + end + end + end +end diff --git a/lib/lrama/length_precedences.rb b/lib/lrama/length_precedences.rb new file mode 100644 index 00000000..15ba218c --- /dev/null +++ b/lib/lrama/length_precedences.rb @@ -0,0 +1,57 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + # Length precedences table for PSLR(1) + # Based on Definition 3.2.15 from the PSLR dissertation + # + # Determines which token should be preferred when there's a length conflict: + # - :left - the 
shorter token (t1) should be preferred + # - :right - the longer token (t2) should be preferred + # - :undefined - no preference defined, use default (longest match) + class LengthPrecedences + # Result of length precedence lookup + LEFT = :left #: Symbol + RIGHT = :right #: Symbol + UNDEFINED = :undefined #: Symbol + + attr_reader :table #: Hash[[String, String], Symbol] + + # @rbs (Grammar::LexPrec lex_prec) -> void + def initialize(lex_prec) + @table = build_table(lex_prec) + end + + # Get the length precedence between two tokens + # @rbs (String t1, String t2) -> Symbol + def precedence(t1, t2) + @table[[t1, t2]] || UNDEFINED + end + + # Check if t1 (shorter) should be preferred over t2 (longer) + # @rbs (String t1, String t2) -> bool + def prefer_shorter?(t1, t2) + precedence(t1, t2) == LEFT + end + + private + + # Build the length precedence table from lex-prec rules + # @rbs (Grammar::LexPrec lex_prec) -> Hash[[String, String], Symbol] + def build_table(lex_prec) + table = {} + + lex_prec.rules.each do |rule| + case rule.operator + when Grammar::LexPrec::SHORTER + # t1 -s t2: t1 (shorter) should be preferred over t2 (longer) + table[[rule.left_name, rule.right_name]] = LEFT + # Inverse: t2 (longer) should not be preferred over t1 (shorter) + table[[rule.right_name, rule.left_name]] = RIGHT + end + end + + table + end + end +end diff --git a/lib/lrama/lexer.rb b/lib/lrama/lexer.rb index ce98b505..4c4eabc6 100644 --- a/lib/lrama/lexer.rb +++ b/lib/lrama/lexer.rb @@ -18,7 +18,8 @@ class Lexer # [::Symbol, Token::Char] | # [::Symbol, Token::Str] | # [::Symbol, Token::Int] | - # [::Symbol, Token::Ident] + # [::Symbol, Token::Ident] | + # [::Symbol, Token::Regex] # # type c_token = [:C_DECLARATION, Token::UserCode] @@ -32,6 +33,7 @@ class Lexer PERCENT_TOKENS = %w( %union %token + %token-pattern %type %nterm %left @@ -43,6 +45,7 @@ class Lexer %printer %destructor %lex-param + %lex-prec %parse-param %initial-action %precedence @@ -121,7 +124,7 @@ def lex_token 
return when @scanner.scan(/#{SYMBOLS.join('|')}/) return [@scanner.matched, Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] - when @scanner.scan(/#{PERCENT_TOKENS.join('|')}/) + when @scanner.scan(/#{PERCENT_TOKENS.sort_by { |s| -s.length }.join('|')}/) return [@scanner.matched, Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] when @scanner.scan(/[\?\+\*]/) return [@scanner.matched, Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] @@ -133,6 +136,12 @@ def lex_token return [:CHARACTER, Lrama::Lexer::Token::Char.new(s_value: @scanner.matched, location: location)] when @scanner.scan(/".*?"/) return [:STRING, Lrama::Lexer::Token::Str.new(s_value: %Q(#{@scanner.matched}), location: location)] + when @scanner.scan(%r{/[^/]+/}) + return [:REGEX, Lrama::Lexer::Token::Regex.new(s_value: @scanner.matched, location: location)] + when @scanner.scan(/-s(?=\s)/) + return ['-s', Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] + when @scanner.scan(/-(?=\s)/) + return ['-', Lrama::Lexer::Token::Token.new(s_value: @scanner.matched, location: location)] when @scanner.scan(/\d+/) return [:INTEGER, Lrama::Lexer::Token::Int.new(s_value: Integer(@scanner.matched), location: location)] when @scanner.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)/) diff --git a/lib/lrama/lexer/token.rb b/lib/lrama/lexer/token.rb index 37f77aa0..3932486e 100644 --- a/lib/lrama/lexer/token.rb +++ b/lib/lrama/lexer/token.rb @@ -7,6 +7,7 @@ require_relative 'token/ident' require_relative 'token/instantiate_rule' require_relative 'token/int' +require_relative 'token/regex' require_relative 'token/str' require_relative 'token/tag' require_relative 'token/token' diff --git a/lib/lrama/lexer/token/regex.rb b/lib/lrama/lexer/token/regex.rb new file mode 100644 index 00000000..c4295f40 --- /dev/null +++ b/lib/lrama/lexer/token/regex.rb @@ -0,0 +1,19 @@ +# rbs_inline: enabled +# frozen_string_literal: true + 
+module Lrama + class Lexer + module Token + # Token class for regex patterns used in %token-pattern directive + # Example: /[a-zA-Z_][a-zA-Z0-9_]*/ + class Regex < Base + # Returns the regex pattern without the surrounding slashes + # @rbs () -> String + def pattern + # Remove leading and trailing slashes + s_value[1..-2].to_s + end + end + end + end +end diff --git a/lib/lrama/option_parser.rb b/lib/lrama/option_parser.rb index 5a15d59c..29ae759c 100644 --- a/lib/lrama/option_parser.rb +++ b/lib/lrama/option_parser.rb @@ -97,6 +97,7 @@ def parse_by_option_parser(argv) o.on_tail ' lookaheads explicitly associate lookahead tokens to items' o.on_tail ' solved describe shift/reduce conflicts solving' o.on_tail ' counterexamples, cex generate conflict counterexamples' + o.on_tail ' pslr report PSLR split and scanner metrics' o.on_tail ' rules list unused rules' o.on_tail ' terms list unused terminals' o.on_tail ' verbose report detailed internal state and analysis results' @@ -141,7 +142,7 @@ def parse_by_option_parser(argv) end ALIASED_REPORTS = { cex: :counterexamples }.freeze #: Hash[Symbol, Symbol] - VALID_REPORTS = %i[states itemsets lookaheads solved counterexamples rules terms verbose].freeze #: Array[Symbol] + VALID_REPORTS = %i[states itemsets lookaheads solved counterexamples pslr rules terms verbose].freeze #: Array[Symbol] # @rbs (Array[String]) -> Hash[Symbol, bool] def validate_report(report) diff --git a/lib/lrama/output.rb b/lib/lrama/output.rb index 24cf725c..d5e3ff1a 100644 --- a/lib/lrama/output.rb +++ b/lib/lrama/output.rb @@ -401,6 +401,353 @@ def percent_code(name) end.join end + # PSLR Output Helper Methods + # Based on PSLR::OutputHelper - generates PSLR-specific C code + + # Check if the grammar requested PSLR output. + def pslr_enabled? + @grammar.pslr_defined? + end + + # Check if PSLR scanner tables are available. + def pslr_scanner_enabled? + scanner_fsa = @context.states.scanner_fsa + !scanner_fsa.nil? && !scanner_fsa.states.empty? 
+ end + + def pslr_function_declarations + return "" unless pslr_enabled? + + declarations = [<<~C_CODE] + int yy_state_accepts_token (int yystate, int yychar); + C_CODE + + if pslr_scanner_enabled? + declarations << <<~C_CODE + int yy_pseudo_scan (int parser_state, const char *input, int *match_length); + C_CODE + + declarations << <<~C_CODE + #define YYPSLR_ENABLED 1 + #define YYPSLR_NO_MATCH YYEMPTY + + #ifndef YYPSLR_PSEUDO_SCAN_STATE + # define YYPSLR_PSEUDO_SCAN_STATE(ParserState, Input, MatchLength) \\ + yy_pseudo_scan ((ParserState), (Input), (MatchLength)) + #endif + C_CODE + end + + if (member = pslr_state_member) + declarations << <<~C_CODE + #ifndef YYGETSTATE_CONTEXT + # define YYGETSTATE_CONTEXT(Context) ((Context)->#{member}) + #endif + C_CODE + + if pslr_scanner_enabled? + declarations << <<~C_CODE + #ifndef YYPSLR_PSEUDO_SCAN + # define YYPSLR_PSEUDO_SCAN(Context, Input, MatchLength) \\ + ((Context) != 0 \\ + ? YYPSLR_PSEUDO_SCAN_STATE (YYGETSTATE_CONTEXT (Context), (Input), (MatchLength)) \\ + : YYEMPTY) + #endif + C_CODE + end + + if !parse_param_name.empty? + declarations << <<~C_CODE + #ifndef YYSETSTATE_CONTEXT + # define YYSETSTATE_CONTEXT(CurrentState) \\ + do { \\ + if (#{parse_param_name} != 0) { \\ + YYGETSTATE_CONTEXT (#{parse_param_name}) = (CurrentState); \\ + } \\ + } while (0) + #endif + C_CODE + end + end + + declarations.join("\n") + end + + def pslr_state_member + member = @grammar.pslr_state_member + member&.strip + end + + def pslr_accepting_states + return [] unless pslr_scanner_enabled? + + @context.states.scanner_fsa.states.select(&:accepting?) + end + + def pslr_token_pattern_count + @context.states.token_patterns.size + end + + def pslr_token_id(token_pattern) + @context.states.find_symbol_by_s_value!(token_pattern.name).token_id + end + + # Generate Scanner FSA transition table as C code + def scanner_transition_table + return "" unless pslr_scanner_enabled? 
+ scanner_fsa = @context.states.scanner_fsa + + lines = [] + lines << "/* Scanner FSA transition table */" + lines << "#define YY_SCANNER_NUM_STATES #{scanner_fsa.states.size}" + lines << "#define YY_SCANNER_INVALID_STATE (-1)" + lines << "" + lines << "static const int yy_scanner_transition[YY_SCANNER_NUM_STATES][256] = {" + + scanner_fsa.states.each_with_index do |state, idx| + transitions = Array.new(256, -1) + state.transitions.each do |char, target_id| + transitions[char.ord] = target_id + end + lines << " /* state #{idx} */ {#{transitions.join(', ')}}#{idx < scanner_fsa.states.size - 1 ? ',' : ''}" + end + + lines << "};" + lines.join("\n") + end + + # Generate state_to_accepting table as C code + def state_to_accepting_table + return "" unless pslr_scanner_enabled? + scanner_fsa = @context.states.scanner_fsa + accepting_indices = Array.new(scanner_fsa.states.size, -1) + + pslr_accepting_states.each_with_index do |state, index| + accepting_indices[state.id] = index + end + + lines = [] + lines << "" + lines << "/* FSA state -> accepting state index mapping */" + lines << "#define YY_ACCEPTING_NONE (-1)" + lines << "" + lines << "static const int yy_state_to_accepting[YY_SCANNER_NUM_STATES] = {" + lines << " #{accepting_indices.join(', ')}" + lines << "};" + lines.join("\n") + end + + def token_pattern_token_ids_table + return "" unless pslr_scanner_enabled? + + lines = [] + lines << "" + lines << "/* token pattern index -> parser token id */" + lines << "#define YY_PSLR_EMPTY_PATTERN (-1)" + lines << "#define YY_NUM_TOKEN_PATTERNS #{pslr_token_pattern_count}" + lines << "" + lines << "static const int yy_token_pattern_to_token_id[YY_NUM_TOKEN_PATTERNS] = {" + lines << " #{@context.states.token_patterns.map {|token_pattern| pslr_token_id(token_pattern) }.join(', ')}" + lines << "};" + lines.join("\n") + end + + # Generate token IDs for accepting states as C code + def accepting_tokens_table + return "" unless pslr_scanner_enabled? 
+ scanner_fsa = @context.states.scanner_fsa + + lines = [] + lines << "" + lines << "/* Accepting state token IDs */" + lines << "/* For each accepting state, list of (token_id, definition_order) pairs */" + lines << "" + + # Collect all unique tokens + all_tokens = @context.states.token_patterns.map(&:name) + lines << "/* Token pattern names: #{all_tokens.join(', ')} */" + lines << "" + + # Generate accepting tokens for each FSA state + scanner_fsa.states.each do |state| + next unless state.accepting? + + token_names = state.accepting_tokens.map(&:name) + lines << "/* State #{state.id} accepts: #{token_names.join(', ')} */" + end + + lines.join("\n") + end + + # Generate scanner_accepts table as C code + def scanner_accepts_table_code + return "" unless pslr_scanner_enabled? + scanner_fsa = @context.states.scanner_fsa + scanner_accepts = @context.states.scanner_accepts_table + return "" unless scanner_accepts + + lines = [] + lines << "" + lines << "/* scanner_accepts[parser_state][accepting_state] -> token pattern index */" + lines << "/* YY_PSLR_EMPTY_PATTERN means no token accepted */" + lines << "" + + num_parser_states = @context.states.states.size + num_accepting_states = pslr_accepting_states.size + + lines << "#define YY_NUM_PARSER_STATES #{num_parser_states}" + lines << "#define YY_NUM_ACCEPTING_STATES #{num_accepting_states}" + lines << "" + + if num_accepting_states > 0 + lines << "static const int yy_scanner_accepts[YY_NUM_PARSER_STATES][YY_NUM_ACCEPTING_STATES] = {" + + @context.states.states.each_with_index do |parser_state, ps_idx| + row = [] + pslr_accepting_states.each do |fsa_state| + token = scanner_accepts[parser_state.id, fsa_state.id] + if token + row << token.definition_order + else + row << -1 + end + end + + lines << " /* parser state #{ps_idx} */ {#{row.join(', ')}}#{ps_idx < num_parser_states - 1 ? 
',' : ''}" + end + + lines << "};" + end + + lines.join("\n") + end + + # Generate length_precedences table as C code + def length_precedences_table_code + return "" unless pslr_scanner_enabled? + length_precedences = @context.states.length_precedences + return "" unless length_precedences + + lines = [] + lines << "" + lines << "/* length_precedences[token1][token2] -> precedence */" + lines << "#define YY_LENGTH_PREC_UNDEFINED 0" + lines << "#define YY_LENGTH_PREC_LEFT 1 /* shorter token wins */" + lines << "#define YY_LENGTH_PREC_RIGHT 2 /* longer token wins */" + lines << "" + + num_tokens = pslr_token_pattern_count + if num_tokens > 0 + lines << "static const int yy_length_precedences[#{num_tokens}][#{num_tokens}] = {" + + @context.states.token_patterns.each_with_index do |t1, i| + row = @context.states.token_patterns.map do |t2| + case length_precedences.precedence(t1.name, t2.name) + when :left then 1 + when :right then 2 + else 0 + end + end + lines << " /* #{t1.name} */ {#{row.join(', ')}}#{i < num_tokens - 1 ? ',' : ''}" + end + + lines << "};" + end + + lines.join("\n") + end + + # Generate pseudo_scan function as C code + def pseudo_scan_function + return "" unless pslr_scanner_enabled? 
+ + <<~C_CODE + + /* + * pseudo_scan: PSLR(1) scanning function + * Based on Definition 3.2.16 from the PSLR dissertation + * + * Input: + * parser_state: Current parser state + * input: Input buffer pointer + * match_length: Output parameter for matched length + * + * Returns: Selected parser token ID, or YYEMPTY if no match + */ + int + yy_pseudo_scan(int parser_state, const char *input, int *match_length) + { + int local_match_length = 0; + int ss = 0; /* FSA initial state */ + int ibest = 0; + int pbest = YY_PSLR_EMPTY_PATTERN; + int i = 0; + + if (match_length == NULL) { + match_length = &local_match_length; + } + + *match_length = 0; + + if (parser_state < 0 || parser_state >= YY_NUM_PARSER_STATES || input == NULL) { + return YYEMPTY; + } + + while (input[i] != '\\0') { + int c = (unsigned char)input[i]; + int next_ss = yy_scanner_transition[ss][c]; + + if (next_ss == YY_SCANNER_INVALID_STATE) { + break; + } + + ss = next_ss; + i++; + + /* Check if this is an accepting state */ + int sa = yy_state_to_accepting[ss]; + if (sa != YY_ACCEPTING_NONE) { + int pattern_index = yy_scanner_accepts[parser_state][sa]; + if (pattern_index != YY_PSLR_EMPTY_PATTERN) { + /* Check length precedences */ + if (pbest == YY_PSLR_EMPTY_PATTERN || + (i > ibest && yy_length_precedences[pbest][pattern_index] != YY_LENGTH_PREC_LEFT) || + (i == ibest && yy_length_precedences[pattern_index][pbest] == YY_LENGTH_PREC_LEFT)) { + pbest = pattern_index; + ibest = i; + } + } + } + } + + *match_length = ibest; + if (pbest == YY_PSLR_EMPTY_PATTERN) { + return YYEMPTY; + } + + return yy_token_pattern_to_token_id[pbest]; + } + C_CODE + end + + # Generate all PSLR C code + def pslr_tables_and_functions + return "" unless pslr_scanner_enabled? 
+ + [ + "/* PSLR(1) Scanner Tables and Functions */", + "/* Generated by Lrama PSLR implementation */", + "", + scanner_transition_table, + state_to_accepting_table, + token_pattern_token_ids_table, + accepting_tokens_table, + scanner_accepts_table_code, + length_precedences_table_code, + pseudo_scan_function + ].join("\n") + end + private def eval_template(file, path) diff --git a/lib/lrama/parser.rb b/lib/lrama/parser.rb index 04632cba..787242b7 100644 --- a/lib/lrama/parser.rb +++ b/lib/lrama/parser.rb @@ -655,7 +655,7 @@ def token_to_str(t) module Lrama class Parser < Racc::Parser -module_eval(<<'...end parser.y/module_eval...', 'parser.y', 505) +module_eval(<<'...end parser.y/module_eval...', 'parser.y', 577) include Lrama::Tracer::Duration @@ -745,325 +745,352 @@ def raise_parse_error(error_message, location) ##### State transition tables begin ### racc_action_table = [ - 98, 98, 99, 99, 87, 53, 53, 52, 178, 110, - 110, 97, 53, 53, 184, 178, 110, 110, 53, 181, - 184, 162, 110, 6, 163, 181, 181, 53, 53, 52, - 52, 181, 79, 79, 53, 53, 52, 52, 43, 79, - 79, 53, 4, 52, 5, 110, 88, 94, 182, 125, - 126, 163, 100, 100, 180, 193, 194, 195, 137, 185, - 188, 180, 4, 44, 5, 185, 188, 94, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 46, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 47, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 47, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 12, 13, 50, - 57, 14, 15, 16, 17, 18, 19, 20, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 57, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 12, 13, 57, - 60, 14, 15, 16, 17, 18, 19, 20, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 57, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 53, 53, 52, - 52, 110, 105, 53, 53, 52, 52, 110, 105, 53, - 53, 52, 52, 110, 105, 53, 53, 52, 52, 110, - 105, 53, 53, 52, 52, 110, 110, 53, 53, 52, - 209, 110, 110, 53, 53, 209, 225, 110, 110, 53, - 53, 209, 209, 110, 110, 193, 194, 195, 137, 216, - 222, 232, 217, 217, 217, 235, 57, 53, 217, 52, - 53, 53, 52, 52, 193, 194, 195, 57, 57, 57, - 66, 67, 68, 69, 70, 72, 72, 72, 86, 
89, - 47, 57, 57, 113, 117, 117, 79, 123, 124, 131, - 47, 133, 137, 139, 143, 149, 150, 151, 152, 133, - 155, 156, 157, 110, 166, 149, 169, 172, 173, 72, - 175, 176, 183, 189, 166, 196, 137, 200, 202, 137, - 166, 211, 166, 137, 72, 176, 218, 176, 72, 137, - 228, 137, 72, 231, 72 ] + 105, 105, 106, 106, 94, 4, 55, 5, 200, 55, + 117, 206, 104, 117, 55, 6, 200, 55, 117, 206, + 203, 117, 55, 203, 54, 141, 142, 86, 203, 45, + 55, 203, 54, 46, 55, 86, 54, 48, 55, 86, + 54, 49, 55, 86, 54, 185, 117, 112, 49, 101, + 184, 55, 95, 54, 52, 117, 112, 107, 107, 202, + 185, 207, 210, 239, 4, 204, 5, 202, 238, 207, + 210, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 101, 33, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 55, 239, 54, 43, 117, 112, 244, 215, 216, + 217, 153, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 59, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 12, 13, 59, 43, 14, 15, 16, 17, + 18, 19, 20, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 59, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 12, 13, 62, 43, 14, 15, 16, + 17, 18, 19, 20, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 59, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 55, 239, 54, 43, 117, 112, + 254, 239, 55, 55, 54, 54, 257, 117, 55, 55, + 54, 54, 117, 117, 55, 55, 54, 54, 117, 117, + 55, 55, 231, 231, 117, 117, 55, 55, 247, 231, + 117, 117, 55, 55, 231, 54, 117, 215, 216, 217, + 153, 129, 130, 131, 129, 130, 131, 55, 55, 54, + 54, 55, 55, 54, 54, 55, 59, 54, 215, 216, + 217, 59, 59, 59, 68, 69, 70, 71, 72, 74, + 74, 80, 74, 74, 93, 96, 49, 59, 59, 120, + 124, 127, 133, 133, 86, 139, 140, 147, 49, 149, + 153, 155, 159, 127, 127, 163, 164, 165, 170, 171, + 172, 173, 149, 176, 177, 178, 117, 117, 188, 170, + 191, 194, 195, 74, 197, 198, 205, 211, 188, 218, + 153, 222, 224, 153, 188, 233, 188, 153, 74, 198, + 240, 198, 74, 153, 250, 153, 74, 253, 74 ] racc_action_check = [ - 51, 97, 51, 97, 41, 75, 165, 75, 165, 75, - 165, 51, 171, 190, 171, 190, 171, 190, 201, 165, - 201, 148, 201, 1, 148, 171, 190, 36, 37, 36, - 37, 201, 36, 37, 38, 39, 38, 39, 5, 38, 
- 39, 117, 0, 117, 0, 117, 41, 46, 168, 88, - 88, 168, 51, 97, 165, 177, 177, 177, 177, 171, - 171, 190, 2, 6, 2, 201, 201, 90, 46, 46, - 46, 46, 46, 46, 46, 46, 46, 9, 46, 46, - 46, 46, 46, 46, 46, 46, 46, 10, 90, 90, - 90, 90, 90, 90, 90, 90, 90, 11, 90, 90, - 90, 90, 90, 90, 90, 90, 90, 3, 3, 12, - 14, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 15, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 8, 8, 16, - 17, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 18, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 63, 13, 63, - 13, 63, 63, 64, 73, 64, 73, 64, 64, 65, - 78, 65, 78, 65, 65, 106, 79, 106, 79, 106, - 106, 118, 180, 118, 180, 118, 180, 188, 196, 188, - 196, 188, 196, 202, 217, 202, 217, 202, 217, 218, - 231, 218, 231, 218, 231, 186, 186, 186, 186, 208, - 213, 227, 208, 213, 227, 234, 24, 113, 234, 113, - 114, 123, 114, 123, 210, 210, 210, 25, 26, 27, - 28, 29, 30, 31, 32, 33, 34, 35, 40, 42, - 47, 55, 60, 71, 74, 76, 80, 81, 87, 91, - 92, 93, 94, 102, 116, 124, 125, 126, 127, 133, - 136, 137, 138, 144, 150, 151, 153, 156, 158, 162, - 163, 164, 170, 174, 176, 178, 179, 182, 184, 187, - 189, 199, 200, 204, 205, 207, 209, 212, 214, 216, - 221, 222, 224, 225, 229 ] + 53, 104, 53, 104, 43, 0, 187, 0, 187, 193, + 187, 193, 53, 193, 212, 1, 212, 223, 212, 223, + 187, 223, 38, 193, 38, 95, 95, 38, 212, 5, + 39, 223, 39, 6, 40, 39, 40, 9, 41, 40, + 41, 10, 65, 41, 65, 169, 65, 65, 11, 48, + 169, 66, 43, 66, 12, 66, 66, 53, 104, 187, + 190, 193, 193, 230, 2, 190, 2, 212, 230, 223, + 223, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 97, 48, 48, 48, 48, 48, 48, 48, 48, 48, + 48, 67, 235, 67, 48, 67, 67, 235, 199, 199, + 199, 199, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 14, 97, 97, 97, 97, 97, 97, 97, 97, + 97, 97, 3, 3, 15, 97, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 16, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 8, 8, 17, 3, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 18, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 113, 249, 113, 8, 113, 113, + 249, 256, 13, 82, 13, 82, 256, 82, 133, 134, + 133, 134, 133, 
134, 202, 210, 202, 210, 202, 210, + 218, 224, 218, 224, 218, 224, 239, 240, 239, 240, + 239, 240, 253, 75, 253, 75, 253, 208, 208, 208, + 208, 79, 79, 79, 80, 80, 80, 85, 86, 85, + 86, 120, 121, 120, 121, 139, 24, 139, 232, 232, + 232, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 42, 44, 49, 57, 62, 73, + 76, 77, 81, 83, 87, 88, 94, 98, 99, 100, + 101, 109, 123, 124, 125, 127, 128, 132, 140, 141, + 142, 143, 149, 152, 153, 154, 160, 163, 171, 172, + 174, 177, 179, 184, 185, 186, 192, 196, 198, 200, + 201, 204, 206, 209, 211, 221, 222, 226, 227, 229, + 231, 234, 236, 238, 243, 244, 246, 247, 251 ] racc_action_pointer = [ - 32, 23, 52, 93, nil, 31, 63, nil, 123, 68, - 74, 84, 103, 165, 94, 111, 123, 135, 141, nil, - nil, nil, nil, nil, 210, 221, 222, 223, 235, 236, - 237, 238, 239, 237, 238, 239, 24, 25, 31, 32, - 243, -1, 247, nil, nil, nil, 43, 237, nil, nil, - nil, -5, nil, nil, nil, 235, nil, nil, nil, nil, - 236, nil, nil, 164, 170, 176, nil, nil, nil, nil, - nil, 245, nil, 171, 246, 2, 247, nil, 177, 183, - 248, 249, nil, nil, nil, nil, nil, 214, 45, nil, - 63, 250, 247, 248, 207, nil, nil, -4, nil, nil, - nil, nil, 261, nil, nil, nil, 182, nil, nil, nil, - nil, nil, nil, 224, 227, nil, 258, 38, 188, nil, - nil, nil, nil, 228, 260, 220, 223, 257, nil, nil, - nil, nil, nil, 256, nil, nil, 224, 266, 255, nil, - nil, nil, nil, nil, 266, nil, nil, nil, -24, nil, - 224, 270, nil, 274, nil, nil, 221, nil, 261, nil, - nil, nil, 271, 275, 232, 3, nil, nil, 3, nil, - 233, 9, nil, nil, 237, nil, 234, 3, 241, 231, - 189, nil, 241, nil, 244, nil, 163, 234, 194, 240, - 10, nil, nil, nil, nil, nil, 195, nil, nil, 289, - 242, 15, 200, nil, 238, 286, nil, 246, 174, 252, - 182, nil, 248, 175, 290, nil, 244, 201, 206, nil, - nil, 283, 246, nil, 294, 259, nil, 176, nil, 296, - nil, 207, nil, nil, 180, nil ] + -6, 15, 53, 107, nil, 22, 33, nil, 138, 27, + 27, 34, 48, 189, 94, 107, 125, 150, 156, nil, + nil, nil, nil, nil, 229, 234, 235, 236, 249, 250, + 251, 252, 253, 
251, 252, 256, 254, 255, 19, 27, + 31, 35, 259, -1, 263, nil, nil, nil, 45, 252, + nil, nil, nil, -5, nil, nil, nil, 250, nil, nil, + nil, nil, 251, nil, nil, 39, 48, 88, nil, nil, + nil, nil, nil, 261, nil, 220, 262, 266, nil, 185, + 188, 264, 190, 265, nil, 234, 235, 266, 267, nil, + nil, nil, nil, nil, 226, 21, nil, 76, 267, 264, + 265, 220, nil, nil, -4, nil, nil, nil, nil, 279, + nil, nil, nil, 181, nil, nil, nil, nil, nil, nil, + 238, 239, nil, 276, 278, 279, nil, 276, 281, nil, + nil, nil, 282, 195, 196, nil, nil, nil, nil, 242, + 283, 237, 240, 279, nil, nil, nil, nil, nil, 278, + nil, nil, 241, 289, 277, nil, nil, nil, nil, nil, + 289, nil, nil, 290, nil, nil, nil, nil, nil, -1, + nil, 243, 294, nil, 298, nil, nil, 240, nil, 284, + nil, nil, nil, nil, 295, 299, 251, 3, nil, nil, + 14, nil, 252, 6, nil, nil, 255, nil, 253, 41, + 259, 250, 201, nil, 259, nil, 262, nil, 170, 253, + 202, 259, 11, nil, nil, nil, nil, nil, 207, nil, + nil, 313, 261, 14, 208, nil, 257, 310, nil, 265, + 17, 270, 191, nil, 267, 46, 314, nil, 263, 213, + 214, nil, nil, 306, 265, nil, 318, 277, nil, 139, + nil, 320, nil, 219, nil, nil, 145, nil ] racc_action_default = [ - -1, -137, -1, -3, -10, -137, -137, -2, -3, -137, - -14, -14, -137, -137, -137, -137, -137, -137, -137, -28, - -29, -34, -35, -36, -137, -137, -137, -137, -137, -137, - -137, -137, -137, -54, -54, -54, -137, -137, -137, -137, - -137, -137, -137, -13, 236, -4, -137, -14, -16, -17, - -20, -132, -100, -101, -131, -18, -23, -89, -24, -25, - -137, -27, -37, -137, -137, -137, -41, -42, -43, -44, - -45, -46, -55, -137, -47, -137, -48, -49, -92, -137, - -95, -97, -98, -50, -51, -52, -53, -137, -137, -11, - -5, -7, -14, -137, -72, -15, -21, -132, -133, -134, - -135, -19, -137, -26, -30, -31, -32, -38, -87, -88, - -136, -39, -40, -137, -56, -58, -60, -137, -83, -85, - -93, -94, -96, -137, -137, -137, -137, -137, -6, -8, - -9, -129, -104, -102, -105, -73, -137, -137, -137, -90, - -33, -59, -57, -61, -80, -86, -84, -99, -137, 
-66, - -70, -137, -12, -137, -103, -109, -137, -22, -137, -62, - -81, -82, -54, -137, -64, -68, -71, -74, -137, -130, - -106, -107, -128, -91, -137, -67, -70, -72, -100, -72, - -137, -125, -137, -109, -100, -110, -72, -72, -137, -70, - -69, -75, -76, -116, -117, -118, -137, -78, -79, -137, - -70, -108, -137, -111, -72, -54, -115, -63, -137, -100, - -119, -126, -65, -137, -54, -114, -72, -137, -137, -120, - -121, -137, -72, -112, -54, -100, -122, -137, -127, -54, - -77, -137, -124, -113, -137, -123 ] + -1, -150, -1, -3, -10, -150, -150, -2, -3, -150, + -14, -14, -150, -150, -150, -150, -150, -150, -150, -28, + -29, -34, -35, -36, -150, -150, -150, -150, -150, -150, + -150, -150, -150, -56, -56, -150, -56, -56, -150, -150, + -150, -150, -150, -150, -150, -13, 258, -4, -150, -14, + -16, -17, -20, -145, -113, -114, -144, -18, -23, -102, + -24, -25, -150, -27, -37, -150, -150, -150, -41, -42, + -43, -44, -45, -46, -57, -150, -47, -150, -48, -70, + -150, -49, -150, -50, -51, -105, -150, -108, -110, -111, + -52, -53, -54, -55, -150, -150, -11, -5, -7, -14, + -150, -85, -15, -21, -145, -146, -147, -148, -19, -150, + -26, -30, -31, -32, -38, -100, -101, -149, -39, -40, + -150, -58, -60, -62, -150, -65, -67, -150, -150, -73, + -74, -75, -150, -150, -96, -98, -106, -107, -109, -150, + -150, -150, -150, -150, -6, -8, -9, -142, -117, -115, + -118, -86, -150, -150, -150, -103, -33, -61, -59, -63, + -93, -68, -66, -93, -72, -71, -99, -97, -112, -150, + -79, -83, -150, -12, -150, -116, -122, -150, -22, -150, + -64, -94, -95, -69, -56, -150, -77, -81, -84, -87, + -150, -143, -119, -120, -141, -104, -150, -80, -83, -85, + -113, -85, -150, -138, -150, -122, -113, -123, -85, -85, + -150, -83, -82, -88, -89, -129, -130, -131, -150, -91, + -92, -150, -83, -121, -150, -124, -85, -56, -128, -76, + -150, -113, -132, -139, -78, -150, -56, -127, -85, -150, + -150, -133, -134, -150, -85, -125, -56, -113, -135, -150, + -140, -56, -90, -150, -137, -126, -150, -136 ] racc_goto_table = [ - 73, 
118, 136, 54, 48, 49, 164, 96, 91, 120, - 121, 93, 187, 208, 107, 111, 112, 119, 134, 213, - 56, 58, 59, 171, 61, 1, 78, 78, 78, 78, - 62, 63, 64, 65, 115, 227, 129, 192, 148, 74, - 76, 95, 187, 118, 118, 207, 204, 3, 234, 7, - 130, 201, 128, 138, 147, 93, 212, 140, 154, 145, - 146, 101, 9, 116, 42, 168, 103, 45, 78, 78, - 219, 127, 51, 71, 141, 142, 77, 83, 84, 85, - 159, 144, 190, 160, 161, 191, 132, 197, 102, 158, - 122, 177, 170, 220, 203, 205, 199, 186, 221, 153, - nil, nil, nil, 116, 116, nil, 198, nil, nil, nil, - nil, nil, 214, 78, 206, nil, 177, nil, nil, nil, - nil, nil, 210, nil, 224, nil, nil, 186, 210, 174, - 229, nil, nil, nil, nil, nil, nil, nil, nil, nil, - nil, nil, nil, 226, 210, nil, nil, nil, nil, nil, - nil, nil, nil, nil, nil, nil, nil, 210, nil, nil, + 75, 77, 134, 152, 50, 51, 135, 56, 136, 137, + 103, 98, 186, 169, 209, 100, 150, 193, 114, 118, + 119, 9, 122, 126, 180, 181, 47, 183, 181, 3, + 63, 7, 85, 85, 85, 85, 64, 65, 66, 67, + 58, 60, 61, 102, 209, 190, 223, 81, 83, 1, + 230, 145, 229, 134, 134, 146, 235, 166, 167, 44, + 144, 154, 168, 234, 100, 175, 156, 157, 158, 123, + 161, 162, 249, 214, 110, 84, 90, 91, 92, 85, + 85, 143, 226, 108, 53, 256, 128, 132, 73, 76, + 78, 160, 79, 148, 212, 109, 179, 138, 192, 242, + 221, 213, 243, 219, 174, nil, 241, 199, nil, nil, + 225, 227, nil, 208, 123, 123, nil, nil, nil, nil, + nil, nil, 220, nil, nil, nil, nil, nil, 236, nil, + 228, nil, 199, 85, nil, nil, nil, nil, 232, nil, + 246, nil, nil, 208, 232, nil, 251, nil, nil, nil, + nil, 196, nil, nil, nil, nil, nil, nil, nil, 248, + 232, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, nil, nil, 232, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, - nil, nil, 215, nil, nil, nil, nil, nil, nil, nil, - nil, 223, nil, nil, nil, nil, nil, nil, nil, nil, - nil, 230, nil, nil, nil, nil, 233 ] + nil, nil, nil, nil, 237, nil, nil, nil, nil, nil, + nil, nil, nil, 245, nil, nil, nil, nil, nil, nil, + nil, nil, nil, 252, 
nil, nil, nil, nil, 255 ] racc_goto_check = [ - 29, 22, 42, 31, 14, 14, 35, 16, 8, 48, - 48, 13, 40, 39, 24, 24, 24, 45, 52, 39, - 18, 18, 18, 54, 17, 1, 31, 31, 31, 31, - 17, 17, 17, 17, 30, 39, 5, 38, 34, 26, - 26, 14, 40, 22, 22, 35, 38, 6, 39, 6, - 9, 54, 8, 16, 48, 13, 35, 24, 52, 45, - 45, 18, 7, 31, 10, 34, 17, 7, 31, 31, - 38, 11, 15, 25, 30, 30, 27, 27, 27, 27, - 32, 33, 36, 43, 44, 42, 14, 42, 46, 47, - 50, 22, 53, 55, 42, 42, 56, 22, 57, 58, - nil, nil, nil, 31, 31, nil, 22, nil, nil, nil, - nil, nil, 42, 31, 22, nil, 22, nil, nil, nil, - nil, nil, 22, nil, 42, nil, nil, 22, 22, 29, - 42, nil, nil, nil, nil, nil, nil, nil, nil, nil, - nil, nil, nil, 22, 22, nil, nil, nil, nil, nil, - nil, nil, nil, nil, nil, nil, nil, 22, nil, nil, + 31, 31, 22, 48, 14, 14, 51, 33, 54, 54, + 16, 8, 41, 40, 46, 13, 58, 60, 24, 24, + 24, 7, 32, 37, 34, 49, 7, 34, 49, 6, + 17, 6, 33, 33, 33, 33, 17, 17, 17, 17, + 18, 18, 18, 14, 46, 40, 60, 28, 28, 1, + 45, 5, 41, 22, 22, 9, 45, 51, 51, 10, + 8, 16, 54, 41, 13, 58, 24, 32, 32, 33, + 37, 37, 45, 44, 17, 29, 29, 29, 29, 33, + 33, 11, 44, 18, 15, 45, 39, 39, 25, 26, + 27, 35, 38, 14, 42, 52, 53, 56, 59, 61, + 62, 48, 63, 48, 64, nil, 44, 22, nil, nil, + 48, 48, nil, 22, 33, 33, nil, nil, nil, nil, + nil, nil, 22, nil, nil, nil, nil, nil, 48, nil, + 22, nil, 22, 33, nil, nil, nil, nil, 22, nil, + 48, nil, nil, 22, 22, nil, 48, nil, nil, nil, + nil, 31, nil, nil, nil, nil, nil, nil, nil, 22, + 22, nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, nil, nil, 22, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, - nil, nil, 29, nil, nil, nil, nil, nil, nil, nil, - nil, 29, nil, nil, nil, nil, nil, nil, nil, nil, - nil, 29, nil, nil, nil, nil, 29 ] + nil, nil, nil, nil, 31, nil, nil, nil, nil, nil, + nil, nil, nil, 31, nil, nil, nil, nil, nil, nil, + nil, nil, nil, 31, nil, nil, nil, nil, 31 ] racc_goto_pointer = [ - nil, 25, nil, nil, nil, -55, 47, 59, -38, -41, - 60, -18, nil, -35, -6, 59, -44, 6, 
6, nil, - nil, nil, -74, nil, -49, 40, 5, 40, nil, -33, - -39, -10, -64, -35, -86, -144, -94, nil, -140, -183, - -159, nil, -92, -61, -60, -58, 31, -50, -69, nil, - 10, nil, -75, -63, -132, -117, -85, -113, -32 ] + nil, 49, nil, nil, nil, -47, 29, 18, -37, -43, + 55, -15, nil, -33, -6, 71, -43, 12, 26, nil, + nil, nil, -80, nil, -47, 55, 55, 55, 11, 37, + nil, -33, -53, -6, -136, -32, nil, -54, 57, 7, + -127, -159, -104, nil, -126, -168, -179, nil, -98, -135, + nil, -76, 36, -59, -77, nil, 10, nil, -84, -78, + -159, -133, -103, -131, -43 ] racc_goto_default = [ - nil, nil, 2, 8, 90, nil, nil, nil, nil, nil, - nil, nil, 10, 11, nil, nil, nil, 55, nil, 21, - 22, 23, 104, 106, nil, nil, nil, nil, 114, 75, - nil, 108, nil, nil, nil, nil, 165, 135, nil, nil, - 179, 167, nil, 109, nil, nil, nil, nil, 81, 80, - 82, 92, nil, nil, nil, nil, nil, nil, nil ] + nil, nil, 2, 8, 97, nil, nil, nil, nil, nil, + nil, nil, 10, 11, nil, nil, nil, 57, nil, 21, + 22, 23, 111, 113, nil, nil, nil, nil, nil, nil, + 121, 82, nil, 115, nil, nil, 125, nil, nil, nil, + nil, nil, 187, 151, nil, nil, 201, 189, nil, 116, + 182, nil, nil, nil, 88, 87, 89, 99, nil, nil, + nil, nil, nil, nil, nil ] racc_reduce_table = [ 0, 0, :racc_error, - 0, 64, :_reduce_1, - 2, 64, :_reduce_2, - 0, 65, :_reduce_3, - 2, 65, :_reduce_4, - 1, 66, :_reduce_5, - 2, 66, :_reduce_6, - 0, 67, :_reduce_none, - 1, 67, :_reduce_none, - 5, 59, :_reduce_none, - 0, 68, :_reduce_10, - 0, 69, :_reduce_11, - 5, 60, :_reduce_12, - 2, 60, :_reduce_13, - 0, 72, :_reduce_14, - 2, 72, :_reduce_15, - 2, 61, :_reduce_none, - 2, 61, :_reduce_none, - 1, 76, :_reduce_18, - 2, 76, :_reduce_19, - 2, 70, :_reduce_20, - 3, 70, :_reduce_21, - 5, 70, :_reduce_22, - 2, 70, :_reduce_none, - 2, 70, :_reduce_24, - 2, 70, :_reduce_25, - 3, 70, :_reduce_26, - 2, 70, :_reduce_27, - 1, 70, :_reduce_28, - 1, 70, :_reduce_29, - 1, 81, :_reduce_30, - 1, 81, :_reduce_31, - 1, 82, :_reduce_32, - 2, 82, :_reduce_33, - 1, 71, :_reduce_none, - 1, 71, 
:_reduce_none, - 1, 71, :_reduce_none, - 2, 71, :_reduce_37, - 3, 71, :_reduce_38, - 3, 71, :_reduce_39, - 3, 71, :_reduce_40, - 2, 71, :_reduce_41, - 2, 71, :_reduce_42, - 2, 71, :_reduce_43, - 2, 71, :_reduce_44, - 2, 71, :_reduce_45, - 2, 77, :_reduce_none, - 2, 77, :_reduce_47, - 2, 77, :_reduce_48, - 2, 77, :_reduce_49, - 2, 77, :_reduce_50, - 2, 77, :_reduce_51, - 2, 77, :_reduce_52, - 2, 77, :_reduce_53, - 0, 87, :_reduce_none, - 1, 87, :_reduce_none, - 1, 88, :_reduce_56, - 2, 88, :_reduce_57, - 2, 83, :_reduce_58, - 3, 83, :_reduce_59, - 0, 91, :_reduce_none, - 1, 91, :_reduce_none, - 3, 86, :_reduce_62, - 8, 78, :_reduce_63, - 5, 79, :_reduce_64, - 8, 79, :_reduce_65, - 1, 92, :_reduce_66, - 3, 92, :_reduce_67, - 1, 93, :_reduce_68, - 3, 93, :_reduce_69, - 0, 99, :_reduce_none, - 1, 99, :_reduce_none, - 0, 100, :_reduce_none, - 1, 100, :_reduce_none, - 1, 94, :_reduce_74, - 3, 94, :_reduce_75, - 3, 94, :_reduce_76, - 7, 94, :_reduce_77, - 3, 94, :_reduce_78, - 3, 94, :_reduce_79, - 0, 102, :_reduce_none, - 1, 102, :_reduce_none, - 1, 90, :_reduce_82, - 1, 103, :_reduce_83, - 2, 103, :_reduce_84, - 2, 84, :_reduce_85, - 3, 84, :_reduce_86, - 1, 80, :_reduce_none, - 1, 80, :_reduce_none, - 0, 104, :_reduce_89, - 0, 105, :_reduce_90, - 5, 75, :_reduce_91, - 1, 106, :_reduce_92, - 2, 106, :_reduce_93, - 2, 107, :_reduce_94, - 1, 108, :_reduce_95, - 2, 108, :_reduce_96, - 1, 85, :_reduce_97, - 1, 85, :_reduce_98, - 3, 85, :_reduce_99, - 1, 89, :_reduce_none, - 1, 89, :_reduce_none, - 1, 110, :_reduce_102, - 2, 110, :_reduce_103, - 2, 62, :_reduce_none, - 2, 62, :_reduce_none, - 4, 109, :_reduce_106, - 1, 111, :_reduce_107, - 3, 111, :_reduce_108, - 0, 112, :_reduce_109, - 2, 112, :_reduce_110, - 3, 112, :_reduce_111, - 5, 112, :_reduce_112, - 7, 112, :_reduce_113, - 4, 112, :_reduce_114, - 3, 112, :_reduce_115, - 1, 96, :_reduce_116, - 1, 96, :_reduce_117, - 1, 96, :_reduce_118, + 0, 69, :_reduce_1, + 2, 69, :_reduce_2, + 0, 70, :_reduce_3, + 2, 70, 
:_reduce_4, + 1, 71, :_reduce_5, + 2, 71, :_reduce_6, + 0, 72, :_reduce_none, + 1, 72, :_reduce_none, + 5, 64, :_reduce_none, + 0, 73, :_reduce_10, + 0, 74, :_reduce_11, + 5, 65, :_reduce_12, + 2, 65, :_reduce_13, + 0, 77, :_reduce_14, + 2, 77, :_reduce_15, + 2, 66, :_reduce_none, + 2, 66, :_reduce_none, + 1, 81, :_reduce_18, + 2, 81, :_reduce_19, + 2, 75, :_reduce_20, + 3, 75, :_reduce_21, + 5, 75, :_reduce_22, + 2, 75, :_reduce_none, + 2, 75, :_reduce_24, + 2, 75, :_reduce_25, + 3, 75, :_reduce_26, + 2, 75, :_reduce_27, + 1, 75, :_reduce_28, + 1, 75, :_reduce_29, + 1, 86, :_reduce_30, + 1, 86, :_reduce_31, + 1, 87, :_reduce_32, + 2, 87, :_reduce_33, + 1, 76, :_reduce_none, + 1, 76, :_reduce_none, + 1, 76, :_reduce_none, + 2, 76, :_reduce_37, + 3, 76, :_reduce_38, + 3, 76, :_reduce_39, + 3, 76, :_reduce_40, + 2, 76, :_reduce_41, + 2, 76, :_reduce_42, + 2, 76, :_reduce_43, + 2, 76, :_reduce_44, + 2, 76, :_reduce_45, + 2, 82, :_reduce_none, + 2, 82, :_reduce_none, + 2, 82, :_reduce_none, + 2, 82, :_reduce_49, + 2, 82, :_reduce_50, + 2, 82, :_reduce_51, + 2, 82, :_reduce_52, + 2, 82, :_reduce_53, + 2, 82, :_reduce_54, + 2, 82, :_reduce_55, + 0, 94, :_reduce_none, + 1, 94, :_reduce_none, + 1, 95, :_reduce_58, + 2, 95, :_reduce_59, + 2, 88, :_reduce_60, + 3, 88, :_reduce_61, + 0, 98, :_reduce_none, + 1, 98, :_reduce_none, + 3, 93, :_reduce_64, + 1, 100, :_reduce_65, + 2, 100, :_reduce_66, + 2, 89, :_reduce_67, + 3, 89, :_reduce_68, + 3, 99, :_reduce_69, + 1, 90, :_reduce_70, + 3, 101, :_reduce_71, + 3, 101, :_reduce_72, + 1, 102, :_reduce_73, + 1, 102, :_reduce_74, + 1, 102, :_reduce_75, + 8, 83, :_reduce_76, + 5, 84, :_reduce_77, + 8, 84, :_reduce_78, + 1, 103, :_reduce_79, + 3, 103, :_reduce_80, + 1, 104, :_reduce_81, + 3, 104, :_reduce_82, + 0, 110, :_reduce_none, + 1, 110, :_reduce_none, + 0, 111, :_reduce_none, + 1, 111, :_reduce_none, + 1, 105, :_reduce_87, + 3, 105, :_reduce_88, + 3, 105, :_reduce_89, + 7, 105, :_reduce_90, + 3, 105, :_reduce_91, + 3, 105, 
:_reduce_92, 0, 113, :_reduce_none, 1, 113, :_reduce_none, - 2, 97, :_reduce_121, - 3, 97, :_reduce_122, - 6, 97, :_reduce_123, - 4, 97, :_reduce_124, - 0, 114, :_reduce_125, - 0, 115, :_reduce_126, - 5, 98, :_reduce_127, - 3, 95, :_reduce_128, - 0, 116, :_reduce_129, - 3, 63, :_reduce_130, - 1, 73, :_reduce_none, - 0, 74, :_reduce_none, - 1, 74, :_reduce_none, - 1, 74, :_reduce_none, - 1, 74, :_reduce_none, - 1, 101, :_reduce_136 ] - -racc_reduce_n = 137 - -racc_shift_n = 236 + 1, 97, :_reduce_95, + 1, 114, :_reduce_96, + 2, 114, :_reduce_97, + 2, 91, :_reduce_98, + 3, 91, :_reduce_99, + 1, 85, :_reduce_none, + 1, 85, :_reduce_none, + 0, 115, :_reduce_102, + 0, 116, :_reduce_103, + 5, 80, :_reduce_104, + 1, 117, :_reduce_105, + 2, 117, :_reduce_106, + 2, 118, :_reduce_107, + 1, 119, :_reduce_108, + 2, 119, :_reduce_109, + 1, 92, :_reduce_110, + 1, 92, :_reduce_111, + 3, 92, :_reduce_112, + 1, 96, :_reduce_none, + 1, 96, :_reduce_none, + 1, 121, :_reduce_115, + 2, 121, :_reduce_116, + 2, 67, :_reduce_none, + 2, 67, :_reduce_none, + 4, 120, :_reduce_119, + 1, 122, :_reduce_120, + 3, 122, :_reduce_121, + 0, 123, :_reduce_122, + 2, 123, :_reduce_123, + 3, 123, :_reduce_124, + 5, 123, :_reduce_125, + 7, 123, :_reduce_126, + 4, 123, :_reduce_127, + 3, 123, :_reduce_128, + 1, 107, :_reduce_129, + 1, 107, :_reduce_130, + 1, 107, :_reduce_131, + 0, 124, :_reduce_none, + 1, 124, :_reduce_none, + 2, 108, :_reduce_134, + 3, 108, :_reduce_135, + 6, 108, :_reduce_136, + 4, 108, :_reduce_137, + 0, 125, :_reduce_138, + 0, 126, :_reduce_139, + 5, 109, :_reduce_140, + 3, 106, :_reduce_141, + 0, 127, :_reduce_142, + 3, 68, :_reduce_143, + 1, 78, :_reduce_none, + 0, 79, :_reduce_none, + 1, 79, :_reduce_none, + 1, 79, :_reduce_none, + 1, 79, :_reduce_none, + 1, 112, :_reduce_149 ] + +racc_reduce_n = 150 + +racc_shift_n = 258 racc_token_table = { false => 0, @@ -1075,57 +1102,62 @@ def raise_parse_error(error_message, location) :INTEGER => 6, :STRING => 7, :TAG => 8, - "%%" => 9, - 
"%{" => 10, - "%}" => 11, - "%require" => 12, - ";" => 13, - "%expect" => 14, - "%define" => 15, - "{" => 16, - "}" => 17, - "%param" => 18, - "%lex-param" => 19, - "%parse-param" => 20, - "%code" => 21, - "%initial-action" => 22, - "%no-stdlib" => 23, - "%locations" => 24, - "%union" => 25, - "%destructor" => 26, - "%printer" => 27, - "%error-token" => 28, - "%after-shift" => 29, - "%before-reduce" => 30, - "%after-reduce" => 31, - "%after-shift-error-token" => 32, - "%after-pop-stack" => 33, - "-temp-group" => 34, - "%token" => 35, - "%type" => 36, - "%nterm" => 37, - "%left" => 38, - "%right" => 39, - "%precedence" => 40, - "%nonassoc" => 41, - "%start" => 42, - "%rule" => 43, - "(" => 44, - ")" => 45, - ":" => 46, - "%inline" => 47, - "," => 48, - "|" => 49, - "%empty" => 50, - "%prec" => 51, - "?" => 52, - "+" => 53, - "*" => 54, - "[" => 55, - "]" => 56, - "{...}" => 57 } - -racc_nt_base = 58 + :REGEX => 9, + "%%" => 10, + "%{" => 11, + "%}" => 12, + "%require" => 13, + ";" => 14, + "%expect" => 15, + "%define" => 16, + "{" => 17, + "}" => 18, + "%param" => 19, + "%lex-param" => 20, + "%parse-param" => 21, + "%code" => 22, + "%initial-action" => 23, + "%no-stdlib" => 24, + "%locations" => 25, + "%union" => 26, + "%destructor" => 27, + "%printer" => 28, + "%error-token" => 29, + "%after-shift" => 30, + "%before-reduce" => 31, + "%after-reduce" => 32, + "%after-shift-error-token" => 33, + "%after-pop-stack" => 34, + "-temp-group" => 35, + "%token" => 36, + "%token-pattern" => 37, + "%lex-prec" => 38, + "%type" => 39, + "%nterm" => 40, + "%left" => 41, + "%right" => 42, + "%precedence" => 43, + "%nonassoc" => 44, + "%start" => 45, + "," => 46, + "-" => 47, + "-s" => 48, + "%rule" => 49, + "(" => 50, + ")" => 51, + ":" => 52, + "%inline" => 53, + "|" => 54, + "%empty" => 55, + "%prec" => 56, + "?" 
=> 57, + "+" => 58, + "*" => 59, + "[" => 60, + "]" => 61, + "{...}" => 62 } + +racc_nt_base = 63 racc_use_result_var = true @@ -1156,6 +1188,7 @@ def raise_parse_error(error_message, location) "INTEGER", "STRING", "TAG", + "REGEX", "\"%%\"", "\"%{\"", "\"%}\"", @@ -1183,6 +1216,8 @@ def raise_parse_error(error_message, location) "\"%after-pop-stack\"", "\"-temp-group\"", "\"%token\"", + "\"%token-pattern\"", + "\"%lex-prec\"", "\"%type\"", "\"%nterm\"", "\"%left\"", @@ -1190,12 +1225,14 @@ def raise_parse_error(error_message, location) "\"%precedence\"", "\"%nonassoc\"", "\"%start\"", + "\",\"", + "\"-\"", + "\"-s\"", "\"%rule\"", "\"(\"", "\")\"", "\":\"", "\"%inline\"", - "\",\"", "\"|\"", "\"%empty\"", "\"%prec\"", @@ -1231,6 +1268,8 @@ def raise_parse_error(error_message, location) "\"-group@symbol|TAG\"", "\"-many1@-group@symbol|TAG\"", "token_declarations", + "token_pattern_declarations", + "lex_prec_declarations", "symbol_declarations", "token_declarations_for_precedence", "token_declaration", @@ -1239,6 +1278,10 @@ def raise_parse_error(error_message, location) "id", "alias", "\"-option@INTEGER\"", + "token_pattern_declaration", + "\"-many1@token_pattern_declaration\"", + "lex_prec_chain", + "lex_prec_op", "rule_args", "rule_rhs_list", "rule_rhs", @@ -1586,8 +1629,12 @@ def _reduce_45(val, _values, result) # reduce 46 omitted -module_eval(<<'.,.,', 'parser.y', 136) - def _reduce_47(val, _values, result) +# reduce 47 omitted + +# reduce 48 omitted + +module_eval(<<'.,.,', 'parser.y', 138) + def _reduce_49(val, _values, result) val[1].each {|hash| hash[:tokens].each {|id| @grammar.add_type(id: id, tag: hash[:tag]) @@ -1598,8 +1645,8 @@ def _reduce_47(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 144) - def _reduce_48(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 146) + def _reduce_50(val, _values, result) val[1].each {|hash| hash[:tokens].each {|id| if @grammar.find_term_by_s_value(id.s_value) @@ -1614,8 +1661,8 @@ def 
_reduce_48(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 156) - def _reduce_49(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 158) + def _reduce_51(val, _values, result) val[1].each {|hash| hash[:tokens].each {|id| sym = @grammar.add_term(id: id, tag: hash[:tag]) @@ -1628,8 +1675,8 @@ def _reduce_49(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 166) - def _reduce_50(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 168) + def _reduce_52(val, _values, result) val[1].each {|hash| hash[:tokens].each {|id| sym = @grammar.add_term(id: id, tag: hash[:tag]) @@ -1642,8 +1689,8 @@ def _reduce_50(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 176) - def _reduce_51(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 178) + def _reduce_53(val, _values, result) val[1].each {|hash| hash[:tokens].each {|id| sym = @grammar.add_term(id: id, tag: hash[:tag]) @@ -1656,8 +1703,8 @@ def _reduce_51(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 186) - def _reduce_52(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 188) + def _reduce_54(val, _values, result) val[1].each {|hash| hash[:tokens].each {|id| sym = @grammar.add_term(id: id, tag: hash[:tag]) @@ -1670,34 +1717,34 @@ def _reduce_52(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 196) - def _reduce_53(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 198) + def _reduce_55(val, _values, result) @grammar.set_start_nterm(val[1]) result end .,., -# reduce 54 omitted +# reduce 56 omitted -# reduce 55 omitted +# reduce 57 omitted -module_eval(<<'.,.,', 'parser.y', 214) - def _reduce_56(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 216) + def _reduce_58(val, _values, result) result = val[1] ? 
val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 214) - def _reduce_57(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 216) + def _reduce_59(val, _values, result) result = val[1] ? val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 202) - def _reduce_58(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 204) + def _reduce_60(val, _values, result) val[1].each {|token_declaration| @grammar.add_term(id: token_declaration[0], alias_name: token_declaration[2], token_id: token_declaration[1]&.s_value, tag: val[0], replace: true) } @@ -1706,8 +1753,8 @@ def _reduce_58(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 208) - def _reduce_59(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 210) + def _reduce_61(val, _values, result) val[2].each {|token_declaration| @grammar.add_term(id: token_declaration[0], alias_name: token_declaration[2], token_id: token_declaration[1]&.s_value, tag: val[1], replace: true) } @@ -1716,19 +1763,129 @@ def _reduce_59(val, _values, result) end .,., -# reduce 60 omitted +# reduce 62 omitted -# reduce 61 omitted +# reduce 63 omitted -module_eval(<<'.,.,', 'parser.y', 213) - def _reduce_62(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 215) + def _reduce_64(val, _values, result) result = val result end .,., -module_eval(<<'.,.,', 'parser.y', 218) - def _reduce_63(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 244) + def _reduce_65(val, _values, result) + result = val[1] ? val[1].unshift(val[0]) : val + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 244) + def _reduce_66(val, _values, result) + result = val[1] ? 
val[1].unshift(val[0]) : val + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 220) + def _reduce_67(val, _values, result) + val[1].each {|decl| + @grammar.add_token_pattern( + id: decl[:id], + pattern: decl[:pattern], + alias_name: decl[:alias], + tag: val[0], + lineno: decl[:id].first_line + ) + } + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 232) + def _reduce_68(val, _values, result) + val[2].each {|decl| + @grammar.add_token_pattern( + id: decl[:id], + pattern: decl[:pattern], + alias_name: decl[:alias], + tag: val[1], + lineno: decl[:id].first_line + ) + } + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 246) + def _reduce_69(val, _values, result) + result = { id: val[0], pattern: val[1], alias: val[2] } + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 252) + def _reduce_70(val, _values, result) + val[0].each {|rule| + @grammar.add_lex_prec_rule( + left_token: rule[:left], + operator: rule[:op], + right_token: rule[:right], + lineno: rule[:left].first_line + ) + } + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 265) + def _reduce_71(val, _values, result) + result = [{ left: val[0], op: val[1], right: val[2] }] + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 269) + def _reduce_72(val, _values, result) + last_right = val[0].last[:right] + result = val[0] + [{ left: last_right, op: val[1], right: val[2] }] + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 276) + def _reduce_73(val, _values, result) + result = Lrama::Grammar::LexPrec::SAME_PRIORITY + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 280) + def _reduce_74(val, _values, result) + result = Lrama::Grammar::LexPrec::HIGHER + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 284) + def _reduce_75(val, _values, result) + result = Lrama::Grammar::LexPrec::SHORTER + + result + end +.,., + +module_eval(<<'.,.,', 'parser.y', 290) + def _reduce_76(val, _values, result) rule = 
Grammar::Parameterized::Rule.new(val[1].s_value, val[3], val[7], tag: val[5]) @grammar.add_parameterized_rule(rule) @@ -1736,8 +1893,8 @@ def _reduce_63(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 225) - def _reduce_64(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 297) + def _reduce_77(val, _values, result) rule = Grammar::Parameterized::Rule.new(val[2].s_value, [], val[4], is_inline: true) @grammar.add_parameterized_rule(rule) @@ -1745,8 +1902,8 @@ def _reduce_64(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 230) - def _reduce_65(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 302) + def _reduce_78(val, _values, result) rule = Grammar::Parameterized::Rule.new(val[2].s_value, val[4], val[7], is_inline: true) @grammar.add_parameterized_rule(rule) @@ -1754,22 +1911,22 @@ def _reduce_65(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 235) - def _reduce_66(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 307) + def _reduce_79(val, _values, result) result = [val[0]] result end .,., -module_eval(<<'.,.,', 'parser.y', 236) - def _reduce_67(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 308) + def _reduce_80(val, _values, result) result = val[0].append(val[2]) result end .,., -module_eval(<<'.,.,', 'parser.y', 241) - def _reduce_68(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 313) + def _reduce_81(val, _values, result) builder = val[0] result = [builder] @@ -1777,8 +1934,8 @@ def _reduce_68(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 246) - def _reduce_69(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 318) + def _reduce_82(val, _values, result) builder = val[2] result = val[0].append(builder) @@ -1786,16 +1943,16 @@ def _reduce_69(val, _values, result) end .,., -# reduce 70 omitted +# reduce 83 omitted -# reduce 71 omitted +# reduce 84 omitted -# reduce 72 omitted +# reduce 85 omitted -# reduce 73 omitted +# reduce 86 omitted 
-module_eval(<<'.,.,', 'parser.y', 253) - def _reduce_74(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 325) + def _reduce_87(val, _values, result) reset_precs result = Grammar::Parameterized::Rhs.new @@ -1803,8 +1960,8 @@ def _reduce_74(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 258) - def _reduce_75(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 330) + def _reduce_88(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen token = val[1] token.alias_name = val[2] @@ -1816,8 +1973,8 @@ def _reduce_75(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 267) - def _reduce_76(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 339) + def _reduce_89(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen builder = val[0] builder.symbols << Lrama::Lexer::Token::InstantiateRule.new(s_value: val[2], location: @lexer.location, args: [val[1]]) @@ -1827,8 +1984,8 @@ def _reduce_76(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 274) - def _reduce_77(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 346) + def _reduce_90(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen builder = val[0] builder.symbols << Lrama::Lexer::Token::InstantiateRule.new(s_value: val[1].s_value, alias_name: val[5], location: @lexer.location, args: val[3], lhs_tag: val[6]) @@ -1838,8 +1995,8 @@ def _reduce_77(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 281) - def _reduce_78(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 353) + def _reduce_91(val, _values, result) user_code = val[1] user_code.alias_name = val[2] builder = val[0] @@ -1850,8 +2007,8 @@ def _reduce_78(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 289) - def _reduce_79(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 361) + def _reduce_92(val, _values, 
result) on_action_error("multiple %prec in a rule", val[0]) if prec_seen? sym = @grammar.find_symbol_by_id!(val[2]) if val[0].rhs.empty? @@ -1867,33 +2024,33 @@ def _reduce_79(val, _values, result) end .,., -# reduce 80 omitted +# reduce 93 omitted -# reduce 81 omitted +# reduce 94 omitted -module_eval(<<'.,.,', 'parser.y', 301) - def _reduce_82(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 373) + def _reduce_95(val, _values, result) result = val[0].s_value if val[0] result end .,., -module_eval(<<'.,.,', 'parser.y', 315) - def _reduce_83(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 387) + def _reduce_96(val, _values, result) result = val[1] ? val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 315) - def _reduce_84(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 387) + def _reduce_97(val, _values, result) result = val[1] ? val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 306) - def _reduce_85(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 378) + def _reduce_98(val, _values, result) result = if val[0] [{tag: val[0], tokens: val[1]}] else @@ -1904,121 +2061,121 @@ def _reduce_85(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 312) - def _reduce_86(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 384) + def _reduce_99(val, _values, result) result = val[0].append({tag: val[1], tokens: val[2]}) result end .,., -# reduce 87 omitted +# reduce 100 omitted -# reduce 88 omitted +# reduce 101 omitted -module_eval(<<'.,.,', 'parser.y', 321) - def _reduce_89(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 393) + def _reduce_102(val, _values, result) begin_c_declaration("}") result end .,., -module_eval(<<'.,.,', 'parser.y', 325) - def _reduce_90(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 397) + def _reduce_103(val, _values, result) end_c_declaration result end .,., -module_eval(<<'.,.,', 'parser.y', 329) - def _reduce_91(val, 
_values, result) +module_eval(<<'.,.,', 'parser.y', 401) + def _reduce_104(val, _values, result) result = val[2] result end .,., -module_eval(<<'.,.,', 'parser.y', 338) - def _reduce_92(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 410) + def _reduce_105(val, _values, result) result = val[1] ? val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 338) - def _reduce_93(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 410) + def _reduce_106(val, _values, result) result = val[1] ? val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 338) - def _reduce_94(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 410) + def _reduce_107(val, _values, result) result = val result end .,., -module_eval(<<'.,.,', 'parser.y', 338) - def _reduce_95(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 410) + def _reduce_108(val, _values, result) result = val[1] ? val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 338) - def _reduce_96(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 410) + def _reduce_109(val, _values, result) result = val[1] ? 
val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 333) - def _reduce_97(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 405) + def _reduce_110(val, _values, result) result = [{tag: nil, tokens: val[0]}] result end .,., -module_eval(<<'.,.,', 'parser.y', 334) - def _reduce_98(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 406) + def _reduce_111(val, _values, result) result = val[0].map {|tag, ids| {tag: tag, tokens: ids} } result end .,., -module_eval(<<'.,.,', 'parser.y', 335) - def _reduce_99(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 407) + def _reduce_112(val, _values, result) result = [{tag: nil, tokens: val[0]}, {tag: val[1], tokens: val[2]}] result end .,., -# reduce 100 omitted +# reduce 113 omitted -# reduce 101 omitted +# reduce 114 omitted -module_eval(<<'.,.,', 'parser.y', 346) - def _reduce_102(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 418) + def _reduce_115(val, _values, result) result = val[1] ? val[1].unshift(val[0]) : val result end .,., -module_eval(<<'.,.,', 'parser.y', 346) - def _reduce_103(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 418) + def _reduce_116(val, _values, result) result = val[1] ? 
val[1].unshift(val[0]) : val result end .,., -# reduce 104 omitted +# reduce 117 omitted -# reduce 105 omitted +# reduce 118 omitted -module_eval(<<'.,.,', 'parser.y', 348) - def _reduce_106(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 420) + def _reduce_119(val, _values, result) lhs = val[0] lhs.alias_name = val[1] val[3].each do |builder| @@ -2031,8 +2188,8 @@ def _reduce_106(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 360) - def _reduce_107(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 432) + def _reduce_120(val, _values, result) if val[0].rhs.count > 1 empties = val[0].rhs.select { |sym| sym.is_a?(Lrama::Lexer::Token::Empty) } empties.each do |empty| @@ -2049,8 +2206,8 @@ def _reduce_107(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 374) - def _reduce_108(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 446) + def _reduce_121(val, _values, result) builder = val[2] if !builder.line builder.line = @lexer.line - 1 @@ -2061,8 +2218,8 @@ def _reduce_108(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 384) - def _reduce_109(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 456) + def _reduce_122(val, _values, result) reset_precs result = @grammar.create_rule_builder(@rule_counter, @midrule_action_counter) @@ -2070,8 +2227,8 @@ def _reduce_109(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 389) - def _reduce_110(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 461) + def _reduce_123(val, _values, result) builder = val[0] builder.add_rhs(Lrama::Lexer::Token::Empty.new(location: @lexer.location)) result = builder @@ -2080,8 +2237,8 @@ def _reduce_110(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 395) - def _reduce_111(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 467) + def _reduce_124(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen token = val[1] token.alias_name 
= val[2] @@ -2093,8 +2250,8 @@ def _reduce_111(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 404) - def _reduce_112(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 476) + def _reduce_125(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen token = Lrama::Lexer::Token::InstantiateRule.new(s_value: val[2], alias_name: val[3], location: @lexer.location, args: [val[1]], lhs_tag: val[4]) builder = val[0] @@ -2106,8 +2263,8 @@ def _reduce_112(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 413) - def _reduce_113(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 485) + def _reduce_126(val, _values, result) on_action_error("intermediate %prec in a rule", val[1]) if @trailing_prec_seen token = Lrama::Lexer::Token::InstantiateRule.new(s_value: val[1].s_value, alias_name: val[5], location: @lexer.location, args: val[3], lhs_tag: val[6]) builder = val[0] @@ -2119,8 +2276,8 @@ def _reduce_113(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 422) - def _reduce_114(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 494) + def _reduce_127(val, _values, result) user_code = val[1] user_code.alias_name = val[2] user_code.tag = val[3] @@ -2132,8 +2289,8 @@ def _reduce_114(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 431) - def _reduce_115(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 503) + def _reduce_128(val, _values, result) on_action_error("multiple %prec in a rule", val[0]) if prec_seen? sym = @grammar.find_symbol_by_id!(val[2]) if val[0].rhs.empty? 
@@ -2149,33 +2306,33 @@ def _reduce_115(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 444) - def _reduce_116(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 516) + def _reduce_129(val, _values, result) result = "option" result end .,., -module_eval(<<'.,.,', 'parser.y', 445) - def _reduce_117(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 517) + def _reduce_130(val, _values, result) result = "nonempty_list" result end .,., -module_eval(<<'.,.,', 'parser.y', 446) - def _reduce_118(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 518) + def _reduce_131(val, _values, result) result = "list" result end .,., -# reduce 119 omitted +# reduce 132 omitted -# reduce 120 omitted +# reduce 133 omitted -module_eval(<<'.,.,', 'parser.y', 451) - def _reduce_121(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 523) + def _reduce_134(val, _values, result) result = if val[1] [Lrama::Lexer::Token::InstantiateRule.new(s_value: val[1].s_value, location: @lexer.location, args: val[0])] else @@ -2186,29 +2343,29 @@ def _reduce_121(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 457) - def _reduce_122(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 529) + def _reduce_135(val, _values, result) result = val[0].append(val[2]) result end .,., -module_eval(<<'.,.,', 'parser.y', 458) - def _reduce_123(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 530) + def _reduce_136(val, _values, result) result = val[0].append(Lrama::Lexer::Token::InstantiateRule.new(s_value: val[2].s_value, location: @lexer.location, args: val[4])) result end .,., -module_eval(<<'.,.,', 'parser.y', 459) - def _reduce_124(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 531) + def _reduce_137(val, _values, result) result = [Lrama::Lexer::Token::InstantiateRule.new(s_value: val[0].s_value, location: @lexer.location, args: val[2])] result end .,., -module_eval(<<'.,.,', 'parser.y', 464) - def _reduce_125(val, _values, result) 
+module_eval(<<'.,.,', 'parser.y', 536) + def _reduce_138(val, _values, result) if prec_seen? on_action_error("multiple User_code after %prec", val[0]) if @code_after_prec @code_after_prec = true @@ -2219,39 +2376,39 @@ def _reduce_125(val, _values, result) end .,., -module_eval(<<'.,.,', 'parser.y', 472) - def _reduce_126(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 544) + def _reduce_139(val, _values, result) end_c_declaration result end .,., -module_eval(<<'.,.,', 'parser.y', 476) - def _reduce_127(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 548) + def _reduce_140(val, _values, result) result = val[2] result end .,., -module_eval(<<'.,.,', 'parser.y', 479) - def _reduce_128(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 551) + def _reduce_141(val, _values, result) result = val[1].s_value result end .,., -module_eval(<<'.,.,', 'parser.y', 484) - def _reduce_129(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 556) + def _reduce_142(val, _values, result) begin_c_declaration('\Z') result end .,., -module_eval(<<'.,.,', 'parser.y', 488) - def _reduce_130(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 560) + def _reduce_143(val, _values, result) end_c_declaration @grammar.epilogue_first_lineno = val[0].first_line + 1 @grammar.epilogue = val[2].s_value @@ -2260,18 +2417,18 @@ def _reduce_130(val, _values, result) end .,., -# reduce 131 omitted +# reduce 144 omitted -# reduce 132 omitted +# reduce 145 omitted -# reduce 133 omitted +# reduce 146 omitted -# reduce 134 omitted +# reduce 147 omitted -# reduce 135 omitted +# reduce 148 omitted -module_eval(<<'.,.,', 'parser.y', 500) - def _reduce_136(val, _values, result) +module_eval(<<'.,.,', 'parser.y', 572) + def _reduce_149(val, _values, result) result = Lrama::Lexer::Token::Ident.new(s_value: val[0].s_value) result end diff --git a/lib/lrama/reporter.rb b/lib/lrama/reporter.rb index ed25cc7f..a9820385 100644 --- a/lib/lrama/reporter.rb +++ b/lib/lrama/reporter.rb @@ 
-5,6 +5,7 @@ require_relative 'reporter/grammar' require_relative 'reporter/precedences' require_relative 'reporter/profile' +require_relative 'reporter/pslr' require_relative 'reporter/rules' require_relative 'reporter/states' require_relative 'reporter/terms' @@ -20,6 +21,7 @@ def initialize(**options) @terms = Terms.new(**options) @conflicts = Conflicts.new @precedences = Precedences.new + @pslr = Pslr.new(**options) @grammar = Grammar.new(**options) @states = States.new(**options) end @@ -31,6 +33,7 @@ def report(io, states) report_duration(:report_terms) { @terms.report(io, states) } report_duration(:report_conflicts) { @conflicts.report(io, states) } report_duration(:report_precedences) { @precedences.report(io, states) } + report_duration(:report_pslr) { @pslr.report(io, states) } report_duration(:report_grammar) { @grammar.report(io, states) } report_duration(:report_states) { @states.report(io, states, ielr: states.ielr_defined?) } end diff --git a/lib/lrama/reporter/pslr.rb b/lib/lrama/reporter/pslr.rb new file mode 100644 index 00000000..766c5855 --- /dev/null +++ b/lib/lrama/reporter/pslr.rb @@ -0,0 +1,42 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class Reporter + class Pslr + # @rbs (?pslr: bool, **bool _) -> void + def initialize(pslr: false, **_) + @pslr = pslr + end + + # @rbs (IO io, Lrama::States states) -> void + def report(io, states) + return unless @pslr + return unless states.pslr_defined? 
+ + metrics = states.pslr_metrics + + io << "PSLR Summary\n\n" + io << " Base states: #{metrics[:base_states_count]}\n" + io << " Total states: #{metrics[:total_states_count]}\n" + io << " Split states: #{metrics[:split_state_count]}\n" + io << " State growth: +#{metrics[:growth_count]} (#{format_ratio(metrics[:growth_ratio])})\n" + io << " Token patterns: #{metrics[:token_pattern_count]}\n" + io << " Scanner states: #{metrics[:scanner_fsa_state_count]}\n" + io << " Inadequacies: #{metrics[:inadequacies_count]}\n" + io << " Max states: #{states.pslr_max_states || 'unbounded'}\n" + io << " Max ratio: #{states.pslr_max_state_ratio || 'unbounded'}\n" + io << "\n" + end + + private + + # @rbs (Numeric?) -> String + def format_ratio(value) + return "n/a" if value.nil? + + "#{format('%.2f', value)}x" + end + end + end +end diff --git a/lib/lrama/scanner_fsa.rb b/lib/lrama/scanner_fsa.rb new file mode 100644 index 00000000..8ecc8186 --- /dev/null +++ b/lib/lrama/scanner_fsa.rb @@ -0,0 +1,506 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + # Scanner Finite State Automaton for PSLR(1) + # Built from token patterns defined by %token-pattern directives + # Based on Definitions 3.2.12, 3.2.13 from the PSLR dissertation + class ScannerFSA + # Represents a state in the scanner FSA + class State + attr_reader :id #: Integer + attr_reader :transitions #: Hash[String, Integer] + attr_reader :accepting_tokens #: Array[Grammar::TokenPattern] + + # @rbs (Integer id) -> void + def initialize(id) + @id = id + @transitions = {} + @accepting_tokens = [] + end + + # @rbs () -> bool + def accepting? + !@accepting_tokens.empty? 
+ end + + # @rbs (String char, Integer target_state_id) -> void + def add_transition(char, target_state_id) + @transitions[char] = target_state_id + end + + # @rbs (Grammar::TokenPattern token_pattern) -> void + def add_accepting_token(token_pattern) + @accepting_tokens << token_pattern + end + end + + attr_reader :states #: Array[State] + attr_reader :initial_state #: State + attr_reader :token_patterns #: Array[Grammar::TokenPattern] + + # @rbs (Array[Grammar::TokenPattern] token_patterns) -> void + def initialize(token_patterns) + @token_patterns = token_patterns + @states = [] + @state_counter = 0 + build_fsa + end + + # Returns the accepting state for a given FSA state + # Definition 3.2.13 (state_to_accepting_state) + # @rbs (Integer state_id) -> State? + def state_to_accepting_state(state_id) + state = @states[state_id] + return nil unless state&.accepting? + state + end + + # Returns the set of tokens accepted at FSA state ss + # Definition 3.2.12 acc(ss) + # @rbs (Integer state_id) -> Array[Grammar::TokenPattern] + def acc_ss(state_id) + state = @states[state_id] + return [] unless state + state.accepting_tokens + end + + # Simulate the FSA on input string starting from initial state + # Returns all accepting states reached during the scan + # @rbs (String input) -> Array[{state: State, position: Integer, token: Grammar::TokenPattern}] + def scan(input) + results = [] + current_state_id = 0 + + input.each_char.with_index do |char, index| + current_state = @states[current_state_id] + break unless current_state + + next_state_id = current_state.transitions[char] + break unless next_state_id + + current_state_id = next_state_id + next_state = @states[next_state_id] + + if next_state.accepting? 
+ next_state.accepting_tokens.each do |token_pattern| + results << { state: next_state, position: index + 1, token: token_pattern } + end + end + end + + results + end + + private + + # Build the FSA from token patterns + # Uses Thompson's construction for NFAs followed by subset construction for DFA + # @rbs () -> void + def build_fsa + return if @token_patterns.empty? + + # Create initial state + @initial_state = create_state + + # Build NFA for each token pattern and convert to DFA + nfa_states = build_nfa + convert_nfa_to_dfa(nfa_states) + end + + # @rbs () -> State + def create_state + state = State.new(@state_counter) + @state_counter += 1 + @states << state + state + end + + # Simple NFA state for regex compilation + class NFAState + attr_reader :id #: Integer + attr_accessor :transitions #: Hash[String?, Array[NFAState]] + attr_accessor :accepting_token #: Grammar::TokenPattern? + + # @rbs (Integer id) -> void + def initialize(id) + @id = id + @transitions = Hash.new { |h, k| h[k] = [] } + @accepting_token = nil + end + + # @rbs (String? char, NFAState target) -> void + def add_transition(char, target) + @transitions[char] << target + end + + # @rbs () -> bool + def accepting? + !@accepting_token.nil? 
+ end + end + + # Build NFA from all token patterns + # @rbs () -> Array[NFAState] + def build_nfa + nfa_states = [] + nfa_counter = [0] + + # Create NFA start state + nfa_start = create_nfa_state(nfa_counter, nfa_states) + + @token_patterns.each do |token_pattern| + # Build NFA fragment for this pattern + start_state, end_state = compile_regex(token_pattern.regex_pattern, nfa_counter, nfa_states) + + # Connect NFA start to this pattern's start with epsilon + nfa_start.add_transition(nil, start_state) + + # Mark end state as accepting + end_state.accepting_token = token_pattern + end + + nfa_states + end + + # @rbs (Array[Integer] counter, Array[NFAState] states) -> NFAState + def create_nfa_state(counter, states) + state = NFAState.new(counter[0]) + counter[0] += 1 + states << state + state + end + + # Compile a regex pattern to NFA fragment + # Returns [start_state, end_state] + # @rbs (String pattern, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_regex(pattern, counter, states) + # Simple regex compiler supporting: + # - Literal characters + # - Character classes [...] + # - Quantifiers *, +, ? 
+ # - Alternation | + # - Grouping () + + compile_sequence(pattern, 0, counter, states) + end + + # Compile a sequence of regex elements + # @rbs (String pattern, Integer pos, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_sequence(pattern, pos, counter, states) + fragments = [] + i = pos + + while i < pattern.length + char = pattern[i] + + case char + when '\\' + # Escape sequence + if i + 1 < pattern.length + i += 1 + next_char = pattern[i] + case next_char + when 'd' + # \d matches digit + frag = compile_char_class('0-9', counter, states) + when 'w' + # \w matches word character + frag = compile_char_class('a-zA-Z0-9_', counter, states) + when 's' + # \s matches whitespace + frag = compile_char_class(' \t\n\r\f\v', counter, states) + else + # Literal escaped character + frag = compile_literal(next_char, counter, states) + end + fragments << frag + end + when '[' + # Character class + class_end = pattern.index(']', i) + raise "Unclosed character class in pattern: #{pattern}" unless class_end + + char_class = pattern[i + 1...class_end] + frag = compile_char_class(char_class, counter, states) + fragments << frag + i = class_end + when '*', '+', '?' + # Quantifier - modify the last fragment + if fragments.empty? 
+ raise "Quantifier #{char} without preceding element in pattern: #{pattern}" + end + last_frag = fragments.pop + quantified = apply_quantifier(last_frag, char, counter, states) + fragments << quantified + when '|' + # Alternation - compile remaining and merge + left_start, left_end = concatenate_fragments(fragments, counter, states) + right_start, right_end = compile_sequence(pattern, i + 1, counter, states) + + # Create alternation + alt_start = create_nfa_state(counter, states) + alt_end = create_nfa_state(counter, states) + + alt_start.add_transition(nil, left_start) + alt_start.add_transition(nil, right_start) + left_end.add_transition(nil, alt_end) + right_end.add_transition(nil, alt_end) + + return [alt_start, alt_end] + when '(' + # Find matching closing paren + depth = 1 + j = i + 1 + while j < pattern.length && depth > 0 + if pattern[j] == '(' + depth += 1 + elsif pattern[j] == ')' + depth -= 1 + end + j += 1 + end + raise "Unclosed group in pattern: #{pattern}" if depth > 0 + + group_content = pattern[i + 1...j - 1] + frag = compile_sequence(group_content, 0, counter, states) + fragments << frag + i = j - 1 + when ')' + # End of group - return + break + when '.' + # Match any character (simplified: printable ASCII) + frag = compile_any_char(counter, states) + fragments << frag + else + # Literal character + frag = compile_literal(char, counter, states) + fragments << frag + end + + i += 1 + end + + if fragments.empty? 
+ # Empty pattern + state = create_nfa_state(counter, states) + return [state, state] + end + + concatenate_fragments(fragments, counter, states) + end + + # Compile a single literal character + # @rbs (String char, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_literal(char, counter, states) + start_state = create_nfa_state(counter, states) + end_state = create_nfa_state(counter, states) + start_state.add_transition(char, end_state) + [start_state, end_state] + end + + # Compile a character class [...] + # @rbs (String char_class, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_char_class(char_class, counter, states) + start_state = create_nfa_state(counter, states) + end_state = create_nfa_state(counter, states) + + chars = expand_char_class(char_class) + chars.each do |c| + start_state.add_transition(c, end_state) + end + + [start_state, end_state] + end + + # Expand character class string to array of characters + # @rbs (String char_class) -> Array[String] + def expand_char_class(char_class) + chars = [] + i = 0 + negated = false + + if char_class[0] == '^' + negated = true + i = 1 + end + + while i < char_class.length + if i + 2 < char_class.length && char_class[i + 1] == '-' + # Range + start_char = char_class[i] + end_char = char_class[i + 2] + (start_char..end_char).each { |c| chars << c } + i += 3 + else + chars << char_class[i] + i += 1 + end + end + + if negated + all_printable = (32..126).map(&:chr) + chars = all_printable - chars + end + + chars + end + + # Compile . 
(any character) + # @rbs (Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_any_char(counter, states) + start_state = create_nfa_state(counter, states) + end_state = create_nfa_state(counter, states) + + # Match printable ASCII + (32..126).each do |code| + start_state.add_transition(code.chr, end_state) + end + + [start_state, end_state] + end + + # Apply a quantifier to a fragment + # @rbs ([NFAState, NFAState] fragment, String quantifier, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def apply_quantifier(fragment, quantifier, counter, states) + frag_start, frag_end = fragment + + case quantifier + when '*' + # Zero or more + new_start = create_nfa_state(counter, states) + new_end = create_nfa_state(counter, states) + + new_start.add_transition(nil, frag_start) + new_start.add_transition(nil, new_end) + frag_end.add_transition(nil, frag_start) + frag_end.add_transition(nil, new_end) + + [new_start, new_end] + when '+' + # One or more + new_end = create_nfa_state(counter, states) + + frag_end.add_transition(nil, frag_start) + frag_end.add_transition(nil, new_end) + + [frag_start, new_end] + when '?' + # Zero or one + new_start = create_nfa_state(counter, states) + new_end = create_nfa_state(counter, states) + + new_start.add_transition(nil, frag_start) + new_start.add_transition(nil, new_end) + frag_end.add_transition(nil, new_end) + + [new_start, new_end] + else + fragment + end + end + + # Concatenate multiple NFA fragments into one + # @rbs (Array[[NFAState, NFAState]] fragments, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def concatenate_fragments(fragments, counter, states) + return create_nfa_state(counter, states).then { |s| [s, s] } if fragments.empty? 
+ return fragments[0] if fragments.size == 1 + + result_start = fragments[0][0] + current_end = fragments[0][1] + + fragments[1..-1].each do |frag_start, frag_end| + current_end.add_transition(nil, frag_start) + current_end = frag_end + end + + [result_start, current_end] + end + + # Convert NFA to DFA using subset construction + # @rbs (Array[NFAState] nfa_states) -> void + def convert_nfa_to_dfa(nfa_states) + return if nfa_states.empty? + + # Clear existing DFA states + @states = [] + @state_counter = 0 + + # Compute epsilon closure of start state + nfa_start = nfa_states[0] + start_closure = epsilon_closure([nfa_start]) + + # Map NFA state sets to DFA states + dfa_states = {} + work_list = [start_closure] + dfa_states[start_closure.map(&:id).sort] = create_state + + @initial_state = @states[0] + + # Mark accepting tokens for initial state + start_closure.each do |nfa_state| + if nfa_state.accepting? + @initial_state.add_accepting_token(nfa_state.accepting_token) + end + end + + while !work_list.empty? + current_nfa_set = work_list.shift + current_dfa = dfa_states[current_nfa_set.map(&:id).sort] + + # Find all possible transitions + transitions = {} + current_nfa_set.each do |nfa_state| + nfa_state.transitions.each do |char, targets| + next if char.nil? # Skip epsilon transitions + transitions[char] ||= [] + transitions[char].concat(targets) + end + end + + transitions.each do |char, targets| + target_closure = epsilon_closure(targets.uniq) + target_key = target_closure.map(&:id).sort + + unless dfa_states.key?(target_key) + new_dfa_state = create_state + dfa_states[target_key] = new_dfa_state + + # Mark accepting tokens + target_closure.each do |nfa_state| + if nfa_state.accepting? 
+ new_dfa_state.add_accepting_token(nfa_state.accepting_token) + end + end + + work_list << target_closure + end + + current_dfa.add_transition(char, dfa_states[target_key].id) + end + end + end + + # Compute epsilon closure of a set of NFA states + # @rbs (Array[NFAState] nfa_states) -> Array[NFAState] + def epsilon_closure(nfa_states) + closure = nfa_states.dup + work_list = nfa_states.dup + + while !work_list.empty? + state = work_list.shift + epsilon_targets = state.transitions[nil] || [] + + epsilon_targets.each do |target| + unless closure.include?(target) + closure << target + work_list << target + end + end + end + + closure + end + end +end diff --git a/lib/lrama/state.rb b/lib/lrama/state.rb index 50912e09..e29622c6 100644 --- a/lib/lrama/state.rb +++ b/lib/lrama/state.rb @@ -4,8 +4,10 @@ require_relative "state/action" require_relative "state/inadequacy_annotation" require_relative "state/item" +require_relative "state/pslr_inadequacy" require_relative "state/reduce_reduce_conflict" require_relative "state/resolved_conflict" +require_relative "state/scanner_accepts" require_relative "state/shift_reduce_conflict" module Lrama @@ -55,6 +57,7 @@ class State attr_accessor :follow_kernel_items #: Hash[Action::Goto, Hash[Item, bool]] attr_accessor :always_follows #: Hash[Action::Goto, Array[Grammar::Symbol]] attr_accessor :goto_follows #: Hash[Action::Goto, Array[Grammar::Symbol]] + attr_accessor :pslr_item_lookahead_set #: lookahead_set? 
# @rbs (Integer id, Grammar::Symbol accessing_symbol, Array[Item] kernels) -> void def initialize(id, accessing_symbol, kernels) @@ -78,6 +81,7 @@ def initialize(id, accessing_symbol, kernels) @follow_kernel_items = {} @always_follows = {} @goto_follows = {} + @pslr_item_lookahead_set = nil @lhs_contributions = {} @lane_items = {} end @@ -147,6 +151,18 @@ def set_look_ahead(rule, look_ahead) reduce.look_ahead = look_ahead end + # @rbs (Action::Reduce reduce) -> Array[Grammar::Symbol] + def acceptable_reduce_lookahead(reduce) + reduce.look_ahead || item_lookahead_set[reduce.item] || [] + end + + # @rbs (Action::Reduce reduce) -> Array[Grammar::Symbol] + def acceptable_pslr_reduce_lookahead(reduce) + return acceptable_reduce_lookahead(reduce) unless @pslr_item_lookahead_set + + @pslr_item_lookahead_set[reduce.item] || acceptable_reduce_lookahead(reduce) + end + # @rbs (Grammar::Rule rule, Hash[Grammar::Symbol, Array[Action::Goto]] sources) -> void def set_look_ahead_sources(rule, sources) reduce = reduces.find do |r| @@ -288,6 +304,16 @@ def split_state? 
# # @rbs (State next_state) -> lookahead_set def propagate_lookaheads(next_state) + propagate_lookaheads_with_filter(next_state, true) + end + + # @rbs (State next_state) -> lookahead_set + def propagate_lookaheads_without_filter(next_state) + propagate_lookaheads_with_filter(next_state, false) + end + + # @rbs (State next_state, bool apply_filter) -> lookahead_set + def propagate_lookaheads_with_filter(next_state, apply_filter) next_state.kernels.map {|next_kernel| lookahead_sets = if next_kernel.position > 1 @@ -297,7 +323,14 @@ def propagate_lookaheads(next_state) goto_follow_set(next_kernel.lhs) end - [next_kernel, lookahead_sets & next_state.lookahead_set_filters[next_kernel]] + lookahead_sets = + if apply_filter + lookahead_sets & next_state.lookahead_set_filters[next_kernel] + else + lookahead_sets + end + + [next_kernel, lookahead_sets] }.to_h end @@ -441,11 +474,14 @@ def item_lookahead_set [] elsif kernel.position > 1 prev_items = predecessors_with_item(kernel) - prev_items.map {|st, i| st.item_lookahead_set[i] }.reduce([]) {|acc, syms| acc |= syms } + prev_items + .map {|st, i| st.item_lookahead_set[i] } + .compact + .reduce([]) {|acc, syms| acc | syms } elsif kernel.position == 1 prev_state = @predecessors.find {|p| p.transitions.any? {|transition| transition.next_sym == kernel.lhs } } - goto = prev_state.nterm_transitions.find {|goto| goto.next_sym == kernel.lhs } - prev_state.goto_follows[goto] + goto = prev_state&.nterm_transitions&.find {|goto| goto.next_sym == kernel.lhs } + prev_state&.goto_follows&.fetch(goto, []) || [] end [kernel, value] }.to_h @@ -479,11 +515,15 @@ def append_predecessor(prev_state) def goto_follow_set(nterm_token) return [] if nterm_token.accept_symbol? 
goto = @lalr_isocore.nterm_transitions.find {|g| g.next_sym == nterm_token } + return [] unless goto + + base_terms = Array(@lalr_isocore.always_follows[goto]) @kernels .select {|kernel| @lalr_isocore.follow_kernel_items[goto][kernel] } .map {|kernel| item_lookahead_set[kernel] } - .reduce(@lalr_isocore.always_follows[goto]) {|result, terms| result |= terms } + .compact + .reduce(base_terms) {|result, terms| result | terms } end # Definition 3.8 (Goto Follows Internal Relation) diff --git a/lib/lrama/state/pslr_inadequacy.rb b/lib/lrama/state/pslr_inadequacy.rb new file mode 100644 index 00000000..451458cf --- /dev/null +++ b/lib/lrama/state/pslr_inadequacy.rb @@ -0,0 +1,79 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class State + # PSLR Inadequacy detection + # Based on Section 3.4.3 from the PSLR dissertation + # + # PSLR inadequacy occurs when state merging causes different + # pseudo-scanner behavior + class PslrInadequacy + # Inadequacy types + LR_RELATIVE = :lr_relative #: Symbol + PSLR_RELATIVE = :pslr_relative #: Symbol + + attr_reader :type #: Symbol + attr_reader :state #: State + attr_reader :conflicting_states #: Array[State] + attr_reader :details #: Hash[Symbol, untyped] + + # @rbs (type: Symbol, state: State, conflicting_states: Array[State], details: Hash[Symbol, untyped]) -> void + def initialize(type:, state:, conflicting_states:, details:) + @type = type + @state = state + @conflicting_states = conflicting_states + @details = details + end + + # @rbs () -> String + def to_s + message = "PSLR Inadequacy (#{type}): state #{state.id} conflicts with states #{conflicting_states.map(&:id).join(', ')}" + return message if details[:profiles].nil? 
+ + profiles = details[:profiles].map do |profile, state_ids| + "#{state_ids.join(', ')} => #{profile.inspect}" + end + + "#{message} (profiles: #{profiles.join(' | ')})" + end + end + + # PSLR Compatibility checker + # Based on Definition 3.4.1 from the dissertation + class PslrCompatibilityChecker + # @rbs (ScannerAccepts scanner_accepts, LengthPrecedences length_prec) -> void + def initialize(scanner_accepts, length_prec) + @scanner_accepts = scanner_accepts + @length_prec = length_prec + end + + # Build a stable scanner profile for a parser state + # @rbs (State state, ScannerFSA scanner_fsa) -> Array[[Integer, String?]] + def profile(state, scanner_fsa) + scanner_fsa.states.each_with_object([]) do |fsa_state, result| + next unless fsa_state.accepting? + + token = @scanner_accepts[state.id, fsa_state.id] + result << [fsa_state.id, token&.name] + end + end + + # Partition states by scanner profile + # @rbs (Array[State] states, ScannerFSA scanner_fsa) -> Hash[Array[[Integer, String?]], Array[State]] + def group_by_profile(states, scanner_fsa) + states.group_by do |state| + profile(state, scanner_fsa) + end + end + + # Check if two states are PSLR-compatible + # Definition 3.4.1: States are compatible if for any input, + # the pseudo-scanner selects the same token + # @rbs (State s1, State s2, ScannerFSA scanner_fsa) -> bool + def compatible?(s1, s2, scanner_fsa) + profile(s1, scanner_fsa) == profile(s2, scanner_fsa) + end + end + end +end diff --git a/lib/lrama/state/scanner_accepts.rb b/lib/lrama/state/scanner_accepts.rb new file mode 100644 index 00000000..ef23e7d9 --- /dev/null +++ b/lib/lrama/state/scanner_accepts.rb @@ -0,0 +1,141 @@ +# rbs_inline: enabled +# frozen_string_literal: true + +module Lrama + class State + # Scanner accepts table for PSLR(1) + # Based on Definition 3.2.14 from the PSLR dissertation + # + # scanner_accepts[sp, sa]: For parser state sp and accepting state sa, + # returns the token that should be selected + class ScannerAccepts + 
attr_reader :table #: Hash[[Integer, Integer], Grammar::TokenPattern?] + + # @rbs (Array[State] parser_states, ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec) -> void + def initialize(parser_states, scanner_fsa, lex_prec, length_prec) + @parser_states = parser_states + @scanner_fsa = scanner_fsa + @lex_prec = lex_prec + @length_prec = length_prec + @table = {} + @profile_map = {} #: Hash[untyped, untyped] # Cache for conflict profile resolution + end + + # Build the scanner_accepts table + # Based on Definition 3.2.20 (compute_scanner_accepts) + # @rbs () -> void + def build + @parser_states.each do |parser_state| + compute_for_parser_state(parser_state) + end + end + + # Get the accepted token for a parser state and accepting state + # @rbs (Integer parser_state_id, Integer accepting_state_id) -> Grammar::TokenPattern? + def [](parser_state_id, accepting_state_id) + @table[[parser_state_id, accepting_state_id]] + end + + private + + # Compute scanner_accepts for a single parser state + # Uses DFS to explore the FSA state space + # @rbs (State parser_state) -> void + def compute_for_parser_state(parser_state) + visited = Set.new + dfs(parser_state, 0, visited) # Start from FSA initial state (id 0) + end + + # DFS exploration of FSA states + # @rbs (State parser_state, Integer fsa_state_id, Set[Integer] visited) -> void + def dfs(parser_state, fsa_state_id, visited) + return if visited.include?(fsa_state_id) + visited << fsa_state_id + + fsa_state = @scanner_fsa.states[fsa_state_id] + return unless fsa_state + + # If this is an accepting state, compute the accepted token + if fsa_state.accepting? 
+ token = resolve(parser_state, fsa_state) + @table[[parser_state.id, fsa_state_id]] = token if token + end + + # Explore transitions + fsa_state.transitions.each_value do |next_state_id| + dfs(parser_state, next_state_id, visited) + end + end + + # Resolve which token should be accepted + # Based on Definition 3.2.19 (resolve) + # @rbs (State parser_state, ScannerFSA::State fsa_state) -> Grammar::TokenPattern? + def resolve(parser_state, fsa_state) + # Get tokens that are both: + # 1. Accepted by the FSA at this state (acc(ss)) + # 2. Accepted by the parser at this state (acc(sp)) + acc_ss = fsa_state.accepting_tokens + acc_sp = compute_acc_sp(parser_state) + + # Intersection: tokens that can be both scanned and parsed + acc_sp_ss = acc_ss.select do |token_pattern| + acc_sp.include?(token_pattern.name) + end + + return nil if acc_sp_ss.empty? + + # Select the highest priority token + select_best_token(acc_sp_ss) + end + + # Compute acc(sp): set of terminal symbols acceptable at parser state sp + # @rbs (State parser_state) -> Set[String] + def compute_acc_sp(parser_state) + tokens = Set.new + + # Add tokens from shift actions (term_transitions) + parser_state.term_transitions.each do |shift| + next_sym = shift.next_sym + tokens << next_sym.id.s_value if next_sym.term? + end + + # Add tokens from reduce actions (lookahead) + parser_state.reduces.each do |reduce| + parser_state.acceptable_pslr_reduce_lookahead(reduce).each do |la| + tokens << la.id.s_value + end + end + + tokens + end + + # Select the best token from candidates based on precedence rules + # @rbs (Array[Grammar::TokenPattern] candidates) -> Grammar::TokenPattern? + def select_best_token(candidates) + return candidates.first if candidates.size <= 1 + + # Sort by: + # 1. Explicit precedence (from %lex-prec - rules) + # 2. 
Definition order (first defined wins) + candidates.min_by do |token| + priority_rank(token, candidates) + end + end + + # Compute priority rank for a token among candidates + # Lower rank = higher priority + # @rbs (Grammar::TokenPattern token, Array[Grammar::TokenPattern] candidates) -> [Integer, Integer] + def priority_rank(token, candidates) + # Check if this token has explicit higher priority over others + higher_count = candidates.count do |other| + next false if other == token + @lex_prec.higher_priority?(token.name, other.name) + end + + # Tokens with more "higher than" relationships get lower rank + # Fallback to definition order + [-higher_count, token.definition_order] + end + end + end +end diff --git a/lib/lrama/states.rb b/lib/lrama/states.rb index ddce627d..835b6e52 100644 --- a/lib/lrama/states.rb +++ b/lib/lrama/states.rb @@ -2,6 +2,7 @@ # frozen_string_literal: true require "forwardable" +require "set" require_relative "tracer/duration" require_relative "state/item" @@ -36,12 +37,18 @@ class States include Lrama::Tracer::Duration def_delegators "@grammar", :symbols, :terms, :nterms, :rules, :precedences, - :accept_symbol, :eof_symbol, :undef_symbol, :find_symbol_by_s_value!, :ielr_defined? + :accept_symbol, :eof_symbol, :undef_symbol, :find_symbol_by_s_value!, :ielr_defined?, :pslr_defined?, + :token_patterns, :lex_prec, :pslr_max_states, :pslr_max_state_ratio attr_reader :states #: Array[State] attr_reader :reads_relation #: Hash[State::Action::Goto, Array[State::Action::Goto]] attr_reader :includes_relation #: Hash[State::Action::Goto, Array[State::Action::Goto]] attr_reader :lookback_relation #: Hash[state_id, Hash[rule_id, Array[State::Action::Goto]]] + attr_reader :scanner_fsa #: ScannerFSA? + attr_reader :length_precedences #: LengthPrecedences? + attr_reader :scanner_accepts_table #: State::ScannerAccepts? 
+ attr_reader :pslr_inadequacies #: Array[State::PslrInadequacy] + attr_reader :pslr_metrics #: Hash[Symbol, Integer | Float | nil] # @rbs (Grammar grammar, Tracer tracer) -> void def initialize(grammar, tracer) @@ -105,6 +112,17 @@ def initialize(grammar, tracer) # second key is rule_id, # value is bitmap of term. @la = {} + @pslr_inadequacies = [] + @pslr_metrics = { + base_states_count: nil, + total_states_count: nil, + split_state_count: 0, + growth_count: 0, + growth_ratio: nil, + token_pattern_count: 0, + scanner_fsa_state_count: 0, + inadequacies_count: 0 + } end # @rbs () -> void @@ -141,6 +159,37 @@ def compute_ielr report_duration(:compute_default_reduction) { compute_default_reduction } end + # Compute PSLR(1) states + # Based on Section 3.4 of the PSLR dissertation + # @rbs () -> void + def compute_pslr + capture_pslr_metrics_before_split + # Preparation + report_duration(:clear_conflicts) { clear_conflicts } + # Phase 1 + report_duration(:compute_predecessors) { compute_predecessors } + report_duration(:compute_follow_kernel_items) { compute_follow_kernel_items } + report_duration(:compute_always_follows) { compute_always_follows } + report_duration(:compute_goto_follows) { compute_goto_follows } + # Phase 2 + report_duration(:build_scanner_fsa) { build_scanner_fsa } + report_duration(:build_length_precedences) { build_length_precedences } + report_duration(:compute_inadequacy_annotations) { compute_inadequacy_annotations } + # Phase 3 + @pslr_split_enabled = true + report_duration(:split_states) { split_states } + @pslr_split_enabled = false + # Phase 4 + report_duration(:clear_look_ahead_sets) { clear_look_ahead_sets } + report_duration(:compute_look_ahead_sets) { compute_look_ahead_sets } + # Phase 5 + report_duration(:compute_conflicts) { compute_conflicts(:ielr) } + report_duration(:compute_default_reduction) { compute_default_reduction } + report_duration(:build_scanner_accepts) { build_scanner_accepts } + 
report_duration(:handle_pslr_inadequacies) { handle_pslr_inadequacies } + finalize_pslr_metrics + end + # @rbs () -> Integer def states_count @states.count @@ -189,6 +238,8 @@ def rr_conflicts_count # @rbs (Logger logger) -> void def validate!(logger) validate_conflicts_within_threshold!(logger) + validate_pslr_state_growth!(logger) + validate_pslr_inadequacies!(logger) end def compute_la_sources_for_conflicted_states @@ -755,12 +806,26 @@ def compute_always_follows_bitmaps # @rbs () -> void def split_states @states.each do |state| - state.transitions.each do |transition| + state.transitions.dup.each do |transition| compute_state(state, transition, transition.to_state) end end end + # @rbs () -> void + def capture_pslr_metrics_before_split + @pslr_metrics = { + base_states_count: @states.count, + total_states_count: @states.count, + split_state_count: 0, + growth_count: 0, + growth_ratio: 1.0, + token_pattern_count: token_patterns.size, + scanner_fsa_state_count: 0, + inadequacies_count: 0 + } + end + # @rbs () -> void def compute_inadequacy_annotations @states.each do |state| @@ -782,17 +847,32 @@ def compute_inadequacy_annotations def merge_lookaheads(state, filtered_lookaheads) return if state.kernels.all? {|item| (filtered_lookaheads[item] - state.item_lookahead_set[item]).empty? } - state.item_lookahead_set = state.item_lookahead_set.merge {|_, v1, v2| v1 | v2 } + state.item_lookahead_set = state.item_lookahead_set.merge(filtered_lookaheads) {|_, v1, v2| v1 | v2 } state.transitions.each do |transition| next if transition.to_state.lookaheads_recomputed compute_state(state, transition, transition.to_state) end end + # @rbs (State state, State::lookahead_set pslr_lookaheads) -> void + def merge_pslr_lookaheads(state, pslr_lookaheads) + state.pslr_item_lookahead_set ||= state.kernels.map {|kernel| [kernel, []] }.to_h + return if state.kernels.all? {|item| (pslr_lookaheads[item] - state.pslr_item_lookahead_set[item]).empty? 
} + + state.pslr_item_lookahead_set = state.pslr_item_lookahead_set.merge(pslr_lookaheads) {|_, v1, v2| v1 | v2 } + end + # @rbs (State state, State::Action::Shift | State::Action::Goto transition, State next_state) -> void def compute_state(state, transition, next_state) propagating_lookaheads = state.propagate_lookaheads(next_state) - s = next_state.ielr_isocores.find {|st| st.is_compatible?(propagating_lookaheads) } + pslr_lookaheads = + if @pslr_split_enabled + state.propagate_lookaheads_without_filter(next_state) + else + propagating_lookaheads + end + + s = next_state.ielr_isocores.find {|st| compatible_split_state?(st, propagating_lookaheads, pslr_lookaheads) } if s.nil? s = next_state.lalr_isocore @@ -809,17 +889,87 @@ def compute_state(state, transition, next_state) st.ielr_isocores = s.ielr_isocores end new_state.lookaheads_recomputed = true - new_state.item_lookahead_set = propagating_lookaheads + new_state.item_lookahead_set = pslr_lookaheads + new_state.pslr_item_lookahead_set = pslr_lookaheads state.update_transition(transition, new_state) elsif(!s.lookaheads_recomputed) s.lookaheads_recomputed = true - s.item_lookahead_set = propagating_lookaheads + s.item_lookahead_set = pslr_lookaheads + s.pslr_item_lookahead_set = pslr_lookaheads else + merge_pslr_lookaheads(s, pslr_lookaheads) if @pslr_split_enabled merge_lookaheads(s, propagating_lookaheads) state.update_transition(transition, s) if state.items_to_state[transition.to_items].id != s.id end end + # @rbs (State state, State::lookahead_set filtered_lookaheads, ?State::lookahead_set pslr_lookaheads) -> bool + def compatible_split_state?(state, filtered_lookaheads, pslr_lookaheads = nil) + return false unless state.is_compatible?(filtered_lookaheads) + return true unless @pslr_split_enabled && @scanner_fsa + + pslr_lookaheads ||= filtered_lookaheads + + pslr_state_signature(state) == pslr_state_signature(state, pslr_lookaheads) + end + + # @rbs (State state, ?State::lookahead_set filtered_lookaheads) 
-> Array[[Integer, String?]] + def pslr_state_signature(state, filtered_lookaheads = nil) + return [] unless @scanner_fsa + + acc_sp = acceptable_tokens_for_pslr(state, filtered_lookaheads) + + @scanner_fsa.states.each_with_object([]) do |fsa_state, signature| + next unless fsa_state.accepting? + + candidates = fsa_state.accepting_tokens.select do |token_pattern| + acc_sp.include?(token_pattern.name) + end + signature << [fsa_state.id, select_best_pslr_token(candidates)&.name] + end + end + + # @rbs (State state, ?State::lookahead_set filtered_lookaheads) -> Set[String] + def acceptable_tokens_for_pslr(state, filtered_lookaheads = nil) + tokens = Set.new + kernel_reduce_items = state.kernels.select(&:end_of_rule?).to_set + + state.term_transitions.each do |shift| + next_sym = shift.next_sym + tokens << next_sym.id.s_value if next_sym.term? + end + + state.reduces.each do |reduce| + look_ahead = + if filtered_lookaheads && kernel_reduce_items.include?(reduce.item) + filtered_lookaheads[reduce.item] || [] + else + state.acceptable_pslr_reduce_lookahead(reduce) + end + + look_ahead.each do |la| + tokens << la.id.s_value + end + end + + tokens + end + + # @rbs (Array[Grammar::TokenPattern] candidates) -> Grammar::TokenPattern? + def select_best_pslr_token(candidates) + return nil if candidates.empty? + return candidates.first if candidates.size == 1 + + candidates.min_by do |token| + higher_count = candidates.count do |other| + next false if other == token + lex_prec.higher_priority?(token.name, other.name) + end + + [-higher_count, token.definition_order] + end + end + # @rbs (Logger logger) -> void def validate_conflicts_within_threshold!(logger) exit false unless conflicts_within_threshold?(logger) @@ -863,5 +1013,142 @@ def clear_look_ahead_sets @_follow_sets = nil @_la = nil end + + # Build Scanner FSA from token patterns + # @rbs () -> void + def build_scanner_fsa + return if token_patterns.empty? 
+
+      @scanner_fsa = ScannerFSA.new(token_patterns)
+    end
+
+    # Build length precedences table
+    # @rbs () -> void
+    def build_length_precedences
+      @length_precedences = LengthPrecedences.new(lex_prec)
+    end
+
+    # Build scanner_accepts table
+    # @rbs () -> void
+    def build_scanner_accepts
+      return unless @scanner_fsa
+
+      @scanner_accepts_table = State::ScannerAccepts.new(
+        @states,
+        @scanner_fsa,
+        lex_prec,
+        @length_precedences
+      )
+      @scanner_accepts_table.build
+    end
+
+    # Handle PSLR inadequacies
+    # Detects states whose pseudo-scanner behavior differs and warns about them (splitting itself is done earlier, by split_states)
+    # @rbs () -> void
+    def handle_pslr_inadequacies
+      return unless @scanner_fsa && @scanner_accepts_table
+
+      @pslr_inadequacies = detect_pslr_inadequacies
+      return if @pslr_inadequacies.empty?
+
+      @tracer.warn("Detected #{@pslr_inadequacies.size} unresolved PSLR inadequacies") if @tracer.respond_to?(:warn)
+    end
+
+    # @rbs () -> void
+    def finalize_pslr_metrics
+      return unless pslr_defined?
+
+      base_states_count = @pslr_metrics[:base_states_count] || @states.count
+      total_states_count = @states.count
+
+      @pslr_metrics = {
+        base_states_count: base_states_count,
+        total_states_count: total_states_count,
+        split_state_count: @states.count(&:split_state?),
+        growth_count: total_states_count - base_states_count,
+        growth_ratio: base_states_count.zero? ? nil : total_states_count.to_f / base_states_count,
+        token_pattern_count: token_patterns.size,
+        scanner_fsa_state_count: @scanner_fsa ?
@scanner_fsa.states.size : 0, + inadequacies_count: @pslr_inadequacies.size + } + end + + # Detect PSLR inadequacies in isocore groups + # @rbs () -> Array[State::PslrInadequacy] + def detect_pslr_inadequacies + inadequacies = [] + + @states.each do |state| + state.transitions.each do |transition| + next_state = transition.to_state + next unless next_state + + propagating_lookaheads = state.propagate_lookaheads_without_filter(next_state.lalr_isocore) + expected_profile = pslr_state_signature(next_state, propagating_lookaheads) + actual_profile = pslr_state_signature(next_state) + + next if expected_profile == actual_profile + + matching_state = next_state.ielr_isocores.find do |candidate| + pslr_state_signature(candidate) == expected_profile + end + + inadequacies << State::PslrInadequacy.new( + type: State::PslrInadequacy::PSLR_RELATIVE, + state: next_state, + conflicting_states: [matching_state, next_state].compact.uniq, + details: { + reason: "Transition reaches a state with an incompatible PSLR scanner profile", + from_state_id: state.id, + transition_symbol: transition.next_sym.id.s_value, + expected_profile: expected_profile, + actual_profile: actual_profile, + matching_state_id: matching_state&.id + } + ) + end + end + + inadequacies + end + + # @rbs (Logger logger) -> void + def validate_pslr_inadequacies!(logger) + return unless pslr_defined? + return if @pslr_inadequacies.empty? + + @pslr_inadequacies.each do |inadequacy| + logger.error(inadequacy.to_s) + end + + exit false + end + + # @rbs (Logger logger) -> void + def validate_pslr_state_growth!(logger) + return unless pslr_defined? + + errors = [] + base_states_count = @pslr_metrics[:base_states_count] || @states.count + total_states_count = @pslr_metrics[:total_states_count] || @states.count + split_state_count = @pslr_metrics[:split_state_count] || @states.count(&:split_state?) 
+ growth_ratio = @pslr_metrics[:growth_ratio] || 1.0 + + if (limit = pslr_max_states) && limit < total_states_count + errors << "PSLR state growth exceeded pslr.max-states=#{limit} (total=#{total_states_count}, base=#{base_states_count}, split=#{split_state_count})" + end + + if (limit = pslr_max_state_ratio) && limit < growth_ratio + errors << "PSLR state growth exceeded pslr.max-state-ratio=#{limit} (ratio=#{format('%.2f', growth_ratio)}x, total=#{total_states_count}, base=#{base_states_count})" + end + + return if errors.empty? + + errors.each do |message| + logger.error(message) + end + + exit false + end end end diff --git a/parser.y b/parser.y index f256d533..43623aed 100644 --- a/parser.y +++ b/parser.y @@ -2,7 +2,7 @@ class Lrama::Parser expect 0 error_on_expect_mismatch - token C_DECLARATION CHARACTER IDENT_COLON IDENTIFIER INTEGER STRING TAG + token C_DECLARATION CHARACTER IDENT_COLON IDENTIFIER INTEGER STRING TAG REGEX rule @@ -132,6 +132,8 @@ rule symbol_declaration: "%token" token_declarations + | "%token-pattern" token_pattern_declarations + | "%lex-prec" lex_prec_declarations | "%type" symbol_declarations { val[1].each {|hash| @@ -213,6 +215,76 @@ rule token_declaration: id INTEGER? alias { result = val } + token_pattern_declarations: + TAG? 
token_pattern_declaration+ + { + val[1].each {|decl| + @grammar.add_token_pattern( + id: decl[:id], + pattern: decl[:pattern], + alias_name: decl[:alias], + tag: val[0], + lineno: decl[:id].first_line + ) + } + } + | token_pattern_declarations TAG token_pattern_declaration+ + { + val[2].each {|decl| + @grammar.add_token_pattern( + id: decl[:id], + pattern: decl[:pattern], + alias_name: decl[:alias], + tag: val[1], + lineno: decl[:id].first_line + ) + } + } + + token_pattern_declaration: + IDENTIFIER REGEX alias + { + result = { id: val[0], pattern: val[1], alias: val[2] } + } + + lex_prec_declarations: + lex_prec_chain + { + val[0].each {|rule| + @grammar.add_lex_prec_rule( + left_token: rule[:left], + operator: rule[:op], + right_token: rule[:right], + lineno: rule[:left].first_line + ) + } + } + + lex_prec_chain: + IDENTIFIER lex_prec_op IDENTIFIER + { + result = [{ left: val[0], op: val[1], right: val[2] }] + } + | lex_prec_chain lex_prec_op IDENTIFIER + { + last_right = val[0].last[:right] + result = val[0] + [{ left: last_right, op: val[1], right: val[2] }] + } + + lex_prec_op: + "," + { + result = Lrama::Grammar::LexPrec::SAME_PRIORITY + } + | "-" + { + result = Lrama::Grammar::LexPrec::HIGHER + } + | "-s" + { + result = Lrama::Grammar::LexPrec::SHORTER + } + rule_declaration: "%rule" IDENTIFIER "(" rule_args ")" TAG? ":" rule_rhs_list { diff --git a/sig/generated/lrama/grammar.rbs b/sig/generated/lrama/grammar.rbs index faab4f04..c0d72733 100644 --- a/sig/generated/lrama/grammar.rbs +++ b/sig/generated/lrama/grammar.rbs @@ -24,6 +24,16 @@ module Lrama def find_symbol_by_s_value!: (::String s_value) -> Grammar::Symbol def ielr_defined?: () -> bool + + def pslr_defined?: () -> bool + + def token_patterns: () -> Array[Grammar::TokenPattern] + + def lex_prec: () -> Grammar::LexPrec + + def pslr_max_states: () -> Integer? + + def pslr_max_state_ratio: () -> Float? 
end include Symbols::Resolver::_DelegatedMethods @@ -76,6 +86,10 @@ module Lrama @start_nterm: Lrama::Lexer::Token::Base? + @token_patterns: Array[Grammar::TokenPattern] + + @lex_prec: Grammar::LexPrec + extend Forwardable attr_reader percent_codes: Array[PercentCode] @@ -136,6 +150,10 @@ module Lrama attr_accessor required: bool + attr_reader token_patterns: Array[Grammar::TokenPattern] + + attr_reader lex_prec: Grammar::LexPrec + # @rbs (Counter rule_counter, bool locations, Hash[String, String] define) -> void def initialize: (Counter rule_counter, bool locations, Hash[String, String] define) -> void @@ -227,8 +245,41 @@ module Lrama # @rbs () -> bool def ielr_defined?: () -> bool + # @rbs () -> bool + def pslr_defined?: () -> bool + + # @rbs () -> String? + def pslr_state_member: () -> String? + + # @rbs () -> Integer? + def pslr_max_states: () -> Integer? + + # @rbs () -> Float? + def pslr_max_state_ratio: () -> Float? + + # Add a token pattern from %token-pattern directive + # @rbs (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, ?alias_name: String?, ?tag: Lexer::Token::Tag?, lineno: Integer) -> Grammar::TokenPattern + def add_token_pattern: (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, lineno: Integer, ?alias_name: String?, ?tag: Lexer::Token::Tag?) -> Grammar::TokenPattern + + # Add a lex-prec rule from %lex-prec directive + # @rbs (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> Grammar::LexPrec::Rule + def add_lex_prec_rule: (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> Grammar::LexPrec::Rule + + # Find a token pattern by its name + # @rbs (String name) -> Grammar::TokenPattern? + def find_token_pattern: (String name) -> Grammar::TokenPattern? + private + # @rbs () -> void + def validate_pslr_configuration!: () -> void + + # @rbs (String key) -> Integer? + def parse_pslr_positive_integer: (String key) -> Integer? 
+ + # @rbs (String key) -> Float? + def parse_pslr_positive_float: (String key) -> Float? + # @rbs () -> void def sort_precedence: () -> void diff --git a/sig/generated/lrama/grammar/lex_prec.rbs b/sig/generated/lrama/grammar/lex_prec.rbs new file mode 100644 index 00000000..fcb13076 --- /dev/null +++ b/sig/generated/lrama/grammar/lex_prec.rbs @@ -0,0 +1,64 @@ +# Generated from lib/lrama/grammar/lex_prec.rb with RBS::Inline + +module Lrama + class Grammar + # Represents lexical precedence rules defined by %lex-prec directive + # Based on Definition 3.2.3, 3.2.4, 3.2.10 from the PSLR dissertation + # + # Example: %lex-prec RANGLE -s RSHIFT # RANGLE is shorter than RSHIFT + # %lex-prec IF - ID # IF has higher priority than ID (same length) + class LexPrec + # Precedence relation types + # "," : Same priority (lex-tie) + # "-" : Left has higher priority than right + # "-s" : Left is shorter match priority over right + SAME_PRIORITY: Symbol + + HIGHER: Symbol + + SHORTER: Symbol + + # Represents a single precedence rule + class Rule + attr_reader left_token: Lexer::Token::Ident + + attr_reader operator: Symbol + + attr_reader right_token: Lexer::Token::Ident + + attr_reader lineno: Integer + + # @rbs (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> void + def initialize: (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> void + + # @rbs () -> String + def left_name: () -> String + + # @rbs () -> String + def right_name: () -> String + end + + attr_reader rules: Array[Rule] + + # @rbs () -> void + def initialize: () -> void + + # @rbs (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> Rule + def add_rule: (left_token: Lexer::Token::Ident, operator: Symbol, right_token: Lexer::Token::Ident, lineno: Integer) -> Rule + + # Check if token t1 has higher priority than t2 + # Based on Definition 3.2.4 + # @rbs 
(String t1, String t2) -> bool + def higher_priority?: (String t1, String t2) -> bool + + # Check if token t1 has shorter-match priority over t2 + # Based on Definition 3.2.15 + # @rbs (String t1, String t2) -> bool + def shorter_priority?: (String t1, String t2) -> bool + + # Check if tokens t1 and t2 are in a lex-tie relationship + # @rbs (String t1, String t2) -> bool + def same_priority?: (String t1, String t2) -> bool + end + end +end diff --git a/sig/generated/lrama/grammar/symbols/resolver.rbs b/sig/generated/lrama/grammar/symbols/resolver.rbs index 2e5f2ebf..8c4980b7 100644 --- a/sig/generated/lrama/grammar/symbols/resolver.rbs +++ b/sig/generated/lrama/grammar/symbols/resolver.rbs @@ -108,6 +108,9 @@ module Lrama # @rbs (Lexer::Token::Base id) -> Grammar::Symbol def find_nterm_by_id!: (Lexer::Token::Base id) -> Grammar::Symbol + # @rbs (Grammar::Symbol sym, id: Lexer::Token::Base, ?alias_name: String?, ?tag: Lexer::Token::Tag?, ?token_id: Integer?) -> void + def replace_term_attributes: (Grammar::Symbol sym, id: Lexer::Token::Base, ?alias_name: String?, ?tag: Lexer::Token::Tag?, ?token_id: Integer?) -> void + # @rbs () -> void def fill_terms_number: () -> void diff --git a/sig/generated/lrama/grammar/token_pattern.rbs b/sig/generated/lrama/grammar/token_pattern.rbs new file mode 100644 index 00000000..2add03da --- /dev/null +++ b/sig/generated/lrama/grammar/token_pattern.rbs @@ -0,0 +1,31 @@ +# Generated from lib/lrama/grammar/token_pattern.rb with RBS::Inline + +module Lrama + class Grammar + # Represents a token pattern defined by %token-pattern directive + # Example: %token-pattern RSHIFT />>/ "right shift" + class TokenPattern + attr_reader id: Lexer::Token::Ident + + attr_reader pattern: Lexer::Token::Regex + + attr_reader alias_name: String? + + attr_reader tag: Lexer::Token::Tag? 
+ + attr_reader lineno: Integer + + attr_reader definition_order: Integer + + # @rbs (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, ?alias_name: String?, ?tag: Lexer::Token::Tag?, lineno: Integer, definition_order: Integer) -> void + def initialize: (id: Lexer::Token::Ident, pattern: Lexer::Token::Regex, lineno: Integer, definition_order: Integer, ?alias_name: String?, ?tag: Lexer::Token::Tag?) -> void + + # @rbs () -> String + def name: () -> String + + # Returns the regex pattern string (without slashes) + # @rbs () -> String + def regex_pattern: () -> String + end + end +end diff --git a/sig/generated/lrama/length_precedences.rbs b/sig/generated/lrama/length_precedences.rbs new file mode 100644 index 00000000..07e23d31 --- /dev/null +++ b/sig/generated/lrama/length_precedences.rbs @@ -0,0 +1,38 @@ +# Generated from lib/lrama/length_precedences.rb with RBS::Inline + +module Lrama + # Length precedences table for PSLR(1) + # Based on Definition 3.2.15 from the PSLR dissertation + # + # Determines which token should be preferred when there's a length conflict: + # - :left - the shorter token (t1) should be preferred + # - :right - the longer token (t2) should be preferred + # - :undefined - no preference defined, use default (longest match) + class LengthPrecedences + # Result of length precedence lookup + LEFT: Symbol + + RIGHT: Symbol + + UNDEFINED: Symbol + + attr_reader table: Hash[[ String, String ], Symbol] + + # @rbs (Grammar::LexPrec lex_prec) -> void + def initialize: (Grammar::LexPrec lex_prec) -> void + + # Get the length precedence between two tokens + # @rbs (String t1, String t2) -> Symbol + def precedence: (String t1, String t2) -> Symbol + + # Check if t1 (shorter) should be preferred over t2 (longer) + # @rbs (String t1, String t2) -> bool + def prefer_shorter?: (String t1, String t2) -> bool + + private + + # Build the length precedence table from lex-prec rules + # @rbs (Grammar::LexPrec lex_prec) -> Hash[[String, String], Symbol] + def 
build_table: (Grammar::LexPrec lex_prec) -> Hash[[ String, String ], Symbol] + end +end diff --git a/sig/generated/lrama/lexer.rbs b/sig/generated/lrama/lexer.rbs index 23202612..997ed3f2 100644 --- a/sig/generated/lrama/lexer.rbs +++ b/sig/generated/lrama/lexer.rbs @@ -4,7 +4,7 @@ module Lrama class Lexer type token = lexer_token | c_token - type lexer_token = [ String, Token::Token ] | [ ::Symbol, Token::Tag ] | [ ::Symbol, Token::Char ] | [ ::Symbol, Token::Str ] | [ ::Symbol, Token::Int ] | [ ::Symbol, Token::Ident ] + type lexer_token = [ String, Token::Token ] | [ ::Symbol, Token::Tag ] | [ ::Symbol, Token::Char ] | [ ::Symbol, Token::Str ] | [ ::Symbol, Token::Int ] | [ ::Symbol, Token::Ident ] | [ ::Symbol, Token::Regex ] type c_token = [ :C_DECLARATION, Token::UserCode ] diff --git a/sig/generated/lrama/lexer/token/regex.rbs b/sig/generated/lrama/lexer/token/regex.rbs new file mode 100644 index 00000000..b832c4be --- /dev/null +++ b/sig/generated/lrama/lexer/token/regex.rbs @@ -0,0 +1,15 @@ +# Generated from lib/lrama/lexer/token/regex.rb with RBS::Inline + +module Lrama + class Lexer + module Token + # Token class for regex patterns used in %token-pattern directive + # Example: /[a-zA-Z_][a-zA-Z0-9_]*/ + class Regex < Base + # Returns the regex pattern without the surrounding slashes + # @rbs () -> String + def pattern: () -> String + end + end + end +end diff --git a/sig/generated/lrama/reporter/pslr.rbs b/sig/generated/lrama/reporter/pslr.rbs new file mode 100644 index 00000000..c9de0805 --- /dev/null +++ b/sig/generated/lrama/reporter/pslr.rbs @@ -0,0 +1,18 @@ +# Generated from lib/lrama/reporter/pslr.rb with RBS::Inline + +module Lrama + class Reporter + class Pslr + # @rbs (?pslr: bool, **bool _) -> void + def initialize: (?pslr: bool, **bool _) -> void + + # @rbs (IO io, Lrama::States states) -> void + def report: (IO io, Lrama::States states) -> void + + private + + # @rbs (Numeric?) -> String + def format_ratio: (Numeric?) 
-> String + end + end +end diff --git a/sig/generated/lrama/scanner_fsa.rbs b/sig/generated/lrama/scanner_fsa.rbs new file mode 100644 index 00000000..490d7201 --- /dev/null +++ b/sig/generated/lrama/scanner_fsa.rbs @@ -0,0 +1,129 @@ +# Generated from lib/lrama/scanner_fsa.rb with RBS::Inline + +module Lrama + # Scanner Finite State Automaton for PSLR(1) + # Built from token patterns defined by %token-pattern directives + # Based on Definitions 3.2.12, 3.2.13 from the PSLR dissertation + class ScannerFSA + # Represents a state in the scanner FSA + class State + attr_reader id: Integer + + attr_reader transitions: Hash[String, Integer] + + attr_reader accepting_tokens: Array[Grammar::TokenPattern] + + # @rbs (Integer id) -> void + def initialize: (Integer id) -> void + + # @rbs () -> bool + def accepting?: () -> bool + + # @rbs (String char, Integer target_state_id) -> void + def add_transition: (String char, Integer target_state_id) -> void + + # @rbs (Grammar::TokenPattern token_pattern) -> void + def add_accepting_token: (Grammar::TokenPattern token_pattern) -> void + end + + attr_reader states: Array[State] + + attr_reader initial_state: State + + attr_reader token_patterns: Array[Grammar::TokenPattern] + + # @rbs (Array[Grammar::TokenPattern] token_patterns) -> void + def initialize: (Array[Grammar::TokenPattern] token_patterns) -> void + + # Returns the accepting state for a given FSA state + # Definition 3.2.13 (state_to_accepting_state) + # @rbs (Integer state_id) -> State? + def state_to_accepting_state: (Integer state_id) -> State? 
+ + # Returns the set of tokens accepted at FSA state ss + # Definition 3.2.12 acc(ss) + # @rbs (Integer state_id) -> Array[Grammar::TokenPattern] + def acc_ss: (Integer state_id) -> Array[Grammar::TokenPattern] + + # Simulate the FSA on input string starting from initial state + # Returns all accepting states reached during the scan + # @rbs (String input) -> Array[{state: State, position: Integer, token: Grammar::TokenPattern}] + def scan: (String input) -> Array[{ state: State, position: Integer, token: Grammar::TokenPattern }] + + private + + # Build the FSA from token patterns + # Uses Thompson's construction for NFAs followed by subset construction for DFA + # @rbs () -> void + def build_fsa: () -> void + + # @rbs () -> State + def create_state: () -> State + + # Simple NFA state for regex compilation + class NFAState + attr_reader id: Integer + + attr_accessor transitions: Hash[String?, Array[NFAState]] + + attr_accessor accepting_token: Grammar::TokenPattern? + + # @rbs (Integer id) -> void + def initialize: (Integer id) -> void + + # @rbs (String? char, NFAState target) -> void + def add_transition: (String? 
char, NFAState target) -> void + + # @rbs () -> bool + def accepting?: () -> bool + end + + # Build NFA from all token patterns + # @rbs () -> Array[NFAState] + def build_nfa: () -> Array[NFAState] + + # @rbs (Array[Integer] counter, Array[NFAState] states) -> NFAState + def create_nfa_state: (Array[Integer] counter, Array[NFAState] states) -> NFAState + + # Compile a regex pattern to NFA fragment + # Returns [start_state, end_state] + # @rbs (String pattern, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_regex: (String pattern, Array[Integer] counter, Array[NFAState] states) -> [ NFAState, NFAState ] + + # Compile a sequence of regex elements + # @rbs (String pattern, Integer pos, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_sequence: (String pattern, Integer pos, Array[Integer] counter, Array[NFAState] states) -> [ NFAState, NFAState ] + + # Compile a single literal character + # @rbs (String char, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_literal: (String char, Array[Integer] counter, Array[NFAState] states) -> [ NFAState, NFAState ] + + # Compile a character class [...] + # @rbs (String char_class, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_char_class: (String char_class, Array[Integer] counter, Array[NFAState] states) -> [ NFAState, NFAState ] + + # Expand character class string to array of characters + # @rbs (String char_class) -> Array[String] + def expand_char_class: (String char_class) -> Array[String] + + # Compile . 
(any character) + # @rbs (Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def compile_any_char: (Array[Integer] counter, Array[NFAState] states) -> [ NFAState, NFAState ] + + # Apply a quantifier to a fragment + # @rbs ([NFAState, NFAState] fragment, String quantifier, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def apply_quantifier: ([ NFAState, NFAState ] fragment, String quantifier, Array[Integer] counter, Array[NFAState] states) -> [ NFAState, NFAState ] + + # Concatenate multiple NFA fragments into one + # @rbs (Array[[NFAState, NFAState]] fragments, Array[Integer] counter, Array[NFAState] states) -> [NFAState, NFAState] + def concatenate_fragments: (Array[[ NFAState, NFAState ]] fragments, Array[Integer] counter, Array[NFAState] states) -> [ NFAState, NFAState ] + + # Convert NFA to DFA using subset construction + # @rbs (Array[NFAState] nfa_states) -> void + def convert_nfa_to_dfa: (Array[NFAState] nfa_states) -> void + + # Compute epsilon closure of a set of NFA states + # @rbs (Array[NFAState] nfa_states) -> Array[NFAState] + def epsilon_closure: (Array[NFAState] nfa_states) -> Array[NFAState] + end +end diff --git a/sig/generated/lrama/state.rbs b/sig/generated/lrama/state.rbs index 8f585c33..c37eb6e6 100644 --- a/sig/generated/lrama/state.rbs +++ b/sig/generated/lrama/state.rbs @@ -76,6 +76,8 @@ module Lrama attr_accessor goto_follows: Hash[Action::Goto, Array[Grammar::Symbol]] + attr_accessor pslr_item_lookahead_set: lookahead_set? 
+ # @rbs (Integer id, Grammar::Symbol accessing_symbol, Array[Item] kernels) -> void def initialize: (Integer id, Grammar::Symbol accessing_symbol, Array[Item] kernels) -> void @@ -100,6 +102,12 @@ module Lrama # @rbs (Grammar::Rule rule, Array[Grammar::Symbol] look_ahead) -> void def set_look_ahead: (Grammar::Rule rule, Array[Grammar::Symbol] look_ahead) -> void + # @rbs (Action::Reduce reduce) -> Array[Grammar::Symbol] + def acceptable_reduce_lookahead: (Action::Reduce reduce) -> Array[Grammar::Symbol] + + # @rbs (Action::Reduce reduce) -> Array[Grammar::Symbol] + def acceptable_pslr_reduce_lookahead: (Action::Reduce reduce) -> Array[Grammar::Symbol] + # @rbs (Grammar::Rule rule, Hash[Grammar::Symbol, Array[Action::Goto]] sources) -> void def set_look_ahead_sources: (Grammar::Rule rule, Hash[Grammar::Symbol, Array[Action::Goto]] sources) -> void @@ -158,6 +166,12 @@ module Lrama # @rbs (State next_state) -> lookahead_set def propagate_lookaheads: (State next_state) -> lookahead_set + # @rbs (State next_state) -> lookahead_set + def propagate_lookaheads_without_filter: (State next_state) -> lookahead_set + + # @rbs (State next_state, bool apply_filter) -> lookahead_set + def propagate_lookaheads_with_filter: (State next_state, bool apply_filter) -> lookahead_set + # Definition 3.43 (is_compatible) # # @rbs (lookahead_set filtered_lookahead) -> bool diff --git a/sig/generated/lrama/state/pslr_inadequacy.rbs b/sig/generated/lrama/state/pslr_inadequacy.rbs new file mode 100644 index 00000000..54c25cff --- /dev/null +++ b/sig/generated/lrama/state/pslr_inadequacy.rbs @@ -0,0 +1,52 @@ +# Generated from lib/lrama/state/pslr_inadequacy.rb with RBS::Inline + +module Lrama + class State + # PSLR Inadequacy detection + # Based on Section 3.4.3 from the PSLR dissertation + # + # PSLR inadequacy occurs when state merging causes different + # pseudo-scanner behavior + class PslrInadequacy + # Inadequacy types + LR_RELATIVE: Symbol + + PSLR_RELATIVE: Symbol + + attr_reader 
type: Symbol + + attr_reader state: State + + attr_reader conflicting_states: Array[State] + + attr_reader details: Hash[Symbol, untyped] + + # @rbs (type: Symbol, state: State, conflicting_states: Array[State], details: Hash[Symbol, untyped]) -> void + def initialize: (type: Symbol, state: State, conflicting_states: Array[State], details: Hash[Symbol, untyped]) -> void + + # @rbs () -> String + def to_s: () -> String + end + + # PSLR Compatibility checker + # Based on Definition 3.4.1 from the dissertation + class PslrCompatibilityChecker + # @rbs (ScannerAccepts scanner_accepts, LengthPrecedences length_prec) -> void + def initialize: (ScannerAccepts scanner_accepts, LengthPrecedences length_prec) -> void + + # Build a stable scanner profile for a parser state + # @rbs (State state, ScannerFSA scanner_fsa) -> Array[[Integer, String?]] + def profile: (State state, ScannerFSA scanner_fsa) -> Array[[ Integer, String? ]] + + # Partition states by scanner profile + # @rbs (Array[State] states, ScannerFSA scanner_fsa) -> Hash[Array[[Integer, String?]], Array[State]] + def group_by_profile: (Array[State] states, ScannerFSA scanner_fsa) -> Hash[Array[[ Integer, String? 
]], Array[State]] + + # Check if two states are PSLR-compatible + # Definition 3.4.1: States are compatible if for any input, + # the pseudo-scanner selects the same token + # @rbs (State s1, State s2, ScannerFSA scanner_fsa) -> bool + def compatible?: (State s1, State s2, ScannerFSA scanner_fsa) -> bool + end + end +end diff --git a/sig/generated/lrama/state/scanner_accepts.rbs b/sig/generated/lrama/state/scanner_accepts.rbs new file mode 100644 index 00000000..0235f7f9 --- /dev/null +++ b/sig/generated/lrama/state/scanner_accepts.rbs @@ -0,0 +1,55 @@ +# Generated from lib/lrama/state/scanner_accepts.rb with RBS::Inline + +module Lrama + class State + # Scanner accepts table for PSLR(1) + # Based on Definition 3.2.14 from the PSLR dissertation + # + # scanner_accepts[sp, sa]: For parser state sp and accepting state sa, + # returns the token that should be selected + class ScannerAccepts + attr_reader table: Hash[[ Integer, Integer ], Grammar::TokenPattern?] + + # @rbs (Array[State] parser_states, ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec) -> void + def initialize: (Array[State] parser_states, ScannerFSA scanner_fsa, Grammar::LexPrec lex_prec, LengthPrecedences length_prec) -> void + + # Build the scanner_accepts table + # Based on Definition 3.2.20 (compute_scanner_accepts) + # @rbs () -> void + def build: () -> void + + # Get the accepted token for a parser state and accepting state + # @rbs (Integer parser_state_id, Integer accepting_state_id) -> Grammar::TokenPattern? + def []: (Integer parser_state_id, Integer accepting_state_id) -> Grammar::TokenPattern? 
+ + private + + # Compute scanner_accepts for a single parser state + # Uses DFS to explore the FSA state space + # @rbs (State parser_state) -> void + def compute_for_parser_state: (State parser_state) -> void + + # DFS exploration of FSA states + # @rbs (State parser_state, Integer fsa_state_id, Set[Integer] visited) -> void + def dfs: (State parser_state, Integer fsa_state_id, Set[Integer] visited) -> void + + # Resolve which token should be accepted + # Based on Definition 3.2.19 (resolve) + # @rbs (State parser_state, ScannerFSA::State fsa_state) -> Grammar::TokenPattern? + def resolve: (State parser_state, ScannerFSA::State fsa_state) -> Grammar::TokenPattern? + + # Compute acc(sp): set of terminal symbols acceptable at parser state sp + # @rbs (State parser_state) -> Set[String] + def compute_acc_sp: (State parser_state) -> Set[String] + + # Select the best token from candidates based on precedence rules + # @rbs (Array[Grammar::TokenPattern] candidates) -> Grammar::TokenPattern? + def select_best_token: (Array[Grammar::TokenPattern] candidates) -> Grammar::TokenPattern? + + # Compute priority rank for a token among candidates + # Lower rank = higher priority + # @rbs (Grammar::TokenPattern token, Array[Grammar::TokenPattern] candidates) -> [Integer, Integer] + def priority_rank: (Grammar::TokenPattern token, Array[Grammar::TokenPattern] candidates) -> [ Integer, Integer ] + end + end +end diff --git a/sig/generated/lrama/states.rbs b/sig/generated/lrama/states.rbs index 8e4b296e..5766dedc 100644 --- a/sig/generated/lrama/states.rbs +++ b/sig/generated/lrama/states.rbs @@ -44,6 +44,16 @@ module Lrama attr_reader lookback_relation: Hash[state_id, Hash[rule_id, Array[State::Action::Goto]]] + attr_reader scanner_fsa: ScannerFSA? + + attr_reader length_precedences: LengthPrecedences? + + attr_reader scanner_accepts_table: State::ScannerAccepts? 
+ + attr_reader pslr_inadequacies: Array[State::PslrInadequacy] + + attr_reader pslr_metrics: Hash[Symbol, Integer | Float | nil] + # @rbs (Grammar grammar, Tracer tracer) -> void def initialize: (Grammar grammar, Tracer tracer) -> void @@ -53,6 +63,11 @@ module Lrama # @rbs () -> void def compute_ielr: () -> void + # Compute PSLR(1) states + # Based on Section 3.4 of the PSLR dissertation + # @rbs () -> void + def compute_pslr: () -> void + # @rbs () -> Integer def states_count: () -> Integer @@ -188,15 +203,33 @@ module Lrama # @rbs () -> void def split_states: () -> void + # @rbs () -> void + def capture_pslr_metrics_before_split: () -> void + # @rbs () -> void def compute_inadequacy_annotations: () -> void # @rbs (State state, State::lookahead_set filtered_lookaheads) -> void def merge_lookaheads: (State state, State::lookahead_set filtered_lookaheads) -> void + # @rbs (State state, State::lookahead_set pslr_lookaheads) -> void + def merge_pslr_lookaheads: (State state, State::lookahead_set pslr_lookaheads) -> void + # @rbs (State state, State::Action::Shift | State::Action::Goto transition, State next_state) -> void def compute_state: (State state, State::Action::Shift | State::Action::Goto transition, State next_state) -> void + # @rbs (State state, State::lookahead_set filtered_lookaheads, ?State::lookahead_set pslr_lookaheads) -> bool + def compatible_split_state?: (State state, State::lookahead_set filtered_lookaheads, ?State::lookahead_set pslr_lookaheads) -> bool + + # @rbs (State state, ?State::lookahead_set filtered_lookaheads) -> Array[[Integer, String?]] + def pslr_state_signature: (State state, ?State::lookahead_set filtered_lookaheads) -> Array[[ Integer, String? ]] + + # @rbs (State state, ?State::lookahead_set filtered_lookaheads) -> Set[String] + def acceptable_tokens_for_pslr: (State state, ?State::lookahead_set filtered_lookaheads) -> Set[String] + + # @rbs (Array[Grammar::TokenPattern] candidates) -> Grammar::TokenPattern? 
+ def select_best_pslr_token: (Array[Grammar::TokenPattern] candidates) -> Grammar::TokenPattern? + # @rbs (Logger logger) -> void def validate_conflicts_within_threshold!: (Logger logger) -> void @@ -211,5 +244,35 @@ module Lrama # @rbs () -> void def clear_look_ahead_sets: () -> void + + # Build Scanner FSA from token patterns + # @rbs () -> void + def build_scanner_fsa: () -> void + + # Build length precedences table + # @rbs () -> void + def build_length_precedences: () -> void + + # Build scanner_accepts table + # @rbs () -> void + def build_scanner_accepts: () -> void + + # Handle PSLR inadequacies + # Detects and splits states where pseudo-scanner behavior differs + # @rbs () -> void + def handle_pslr_inadequacies: () -> void + + # @rbs () -> void + def finalize_pslr_metrics: () -> void + + # Detect PSLR inadequacies in isocore groups + # @rbs () -> Array[State::PslrInadequacy] + def detect_pslr_inadequacies: () -> Array[State::PslrInadequacy] + + # @rbs (Logger logger) -> void + def validate_pslr_inadequacies!: (Logger logger) -> void + + # @rbs (Logger logger) -> void + def validate_pslr_state_growth!: (Logger logger) -> void end end diff --git a/spec/fixtures/command/pslr_growth_limit.y b/spec/fixtures/command/pslr_growth_limit.y new file mode 100644 index 00000000..ff5cb50a --- /dev/null +++ b/spec/fixtures/command/pslr_growth_limit.y @@ -0,0 +1,36 @@ +%define lr.type pslr + +%token-pattern P /p/ +%token-pattern Q /q/ +%token-pattern X /x/ +%token-pattern IF /if/ +%token-pattern ID /[a-z]+/ + +%lex-prec IF - ID + +%% + +program + : kw_context + | id_context + ; + +kw_context + : P shared IF + ; + +id_context + : Q shared ID + ; + +shared + : n1 + ; + +n1 + : n2 + ; + +n2 + : X + ; diff --git a/spec/fixtures/command/pslr_pure_reduce.y b/spec/fixtures/command/pslr_pure_reduce.y new file mode 100644 index 00000000..89ede00e --- /dev/null +++ b/spec/fixtures/command/pslr_pure_reduce.y @@ -0,0 +1,26 @@ +%define lr.type pslr + +%token-pattern RSHIFT />>/ 
+%token-pattern RANGLE />/ +%token-pattern ID /[a-z]+/ + +%lex-prec RANGLE -s RSHIFT + +%% + +program + : templ + | rshift_expr + ; + +templ + : a RANGLE + ; + +rshift_expr + : a RSHIFT ID + ; + +a + : ID + ; diff --git a/spec/fixtures/integration/pslr_context.l b/spec/fixtures/integration/pslr_context.l new file mode 100644 index 00000000..06cbd121 --- /dev/null +++ b/spec/fixtures/integration/pslr_context.l @@ -0,0 +1,50 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include <stdio.h> +#include "pslr_context.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +ID_PATTERN [a-zA-Z_][a-zA-Z0-9_]* + +%% + +{ID_PATTERN} { + (void)yylval; + return ID; +} + +"<" { + return LANGLE; +} + +[>]{1,2} { + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = (yyleng == 2) ? RSHIFT : RANGLE; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +[[:space:]]+ {} + +<<EOF>> { + return YYEOF; +} + +. 
{ + fprintf(stderr, "Illegal character '%s'\n", yytext); + return YYUNDEF; +} + +%% diff --git a/spec/fixtures/integration/pslr_context.y b/spec/fixtures/integration/pslr_context.y new file mode 100644 index 00000000..9754e435 --- /dev/null +++ b/spec/fixtures/integration/pslr_context.y @@ -0,0 +1,75 @@ +%{ +#include <stdio.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_context.h" +#include "pslr_context-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern RSHIFT />>/ +%token-pattern RANGLE />/ +%token-pattern LANGLE /</ +#include "pslr_keyword_context.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +%% + +[a-z]+ { + (void)yylval; + + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = ID; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +[[:space:]]+ {} + +<<EOF>> { + return YYEOF; +} + +. 
{ + fprintf(stderr, "Illegal character '%s'\n", yytext); + return YYUNDEF; +} + +%% diff --git a/spec/fixtures/integration/pslr_keyword_context.y b/spec/fixtures/integration/pslr_keyword_context.y new file mode 100644 index 00000000..e8effdf3 --- /dev/null +++ b/spec/fixtures/integration/pslr_keyword_context.y @@ -0,0 +1,87 @@ +%{ +#include <stdio.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_keyword_context.h" +#include "pslr_keyword_context-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern P /p/ +%token-pattern Q /q/ +%token-pattern X /x/ +%token-pattern IF /if/ +%token-pattern ID /[a-z]+/ + +%lex-prec IF - ID + +%% + +program + : kw_context { printf("kw\n"); } + | id_context { printf("id\n"); } + ; + +kw_context + : P shared IF + ; + +id_context + : Q shared ID + ; + +shared + : n1 + ; + +n1 + : n2 + ; + +n2 + : X + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, "parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(&params)) { + fprintf(stderr, "syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/fixtures/integration/pslr_mixed_context.l b/spec/fixtures/integration/pslr_mixed_context.l new file mode 100644 index 00000000..5c75c7a3 --- /dev/null +++ b/spec/fixtures/integration/pslr_mixed_context.l @@ -0,0 +1,69 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include <stdio.h> +#include "pslr_mixed_context.h" + +#define YY_DECL int yylex(YYSTYPE 
*yylval, struct parse_params *p) +%} + +%% + +[a-z]+ { + (void)yylval; + + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = ID; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +"<" { + return LT; +} + +"@" { + return START; +} + +"#" { + return MARK; +} + +[>]{1,2} { + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = (yyleng == 2) ? RSHIFT : RANGLE; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +[[:space:]]+ {} + +<<EOF>> { + return YYEOF; +} + +. { + fprintf(stderr, "Illegal character '%s'\n", yytext); + return YYUNDEF; +} + +%% diff --git a/spec/fixtures/integration/pslr_mixed_context.y b/spec/fixtures/integration/pslr_mixed_context.y new file mode 100644 index 00000000..090b180f --- /dev/null +++ b/spec/fixtures/integration/pslr_mixed_context.y @@ -0,0 +1,102 @@ +%{ +#include <stdio.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_mixed_context.h" +#include "pslr_mixed_context-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern LT /</ +%token-pattern P /p/ +%token-pattern Q /q/ +%token-pattern X /x/ +%token-pattern IF /if/ +%token-pattern ID /[a-z]+/ +%token-pattern START /@/ +%token-pattern MARK /#/ +%token-pattern RSHIFT />>/ +%token-pattern RANGLE />/ + +%lex-prec IF - ID +%lex-prec RANGLE -s RSHIFT + +%% + +program + : kw_context { printf("kw\n"); } + | id_context { printf("id\n"); } + | template_expr { printf("template\n"); } + | shift_expr { printf("shift\n"); } + ; + +kw_context + : P shared IF + ; + +id_context + : Q shared ID + ; + +template_expr + : LT shared RANGLE 
+ ; + +shift_expr + : START shared RSHIFT ID + ; + +shared + : n1 + ; + +n1 + : n2 + ; + +n2 + : MARK + ; + +%% + +static int +yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, "parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(&params)) { + fprintf(stderr, "syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/fixtures/integration/pslr_shift_chain.l b/spec/fixtures/integration/pslr_shift_chain.l new file mode 100644 index 00000000..885ba62e --- /dev/null +++ b/spec/fixtures/integration/pslr_shift_chain.l @@ -0,0 +1,58 @@ +%option noinput nounput noyywrap never-interactive + +%{ +#include <stdio.h> +#include "pslr_shift_chain.h" + +#define YY_DECL int yylex(YYSTYPE *yylval, struct parse_params *p) +%} + +ID_PATTERN [a-z]+ + +%% + +{ID_PATTERN} { + (void)yylval; + return ID; +} + +"<" { + return LT; +} + +"@" { + return START; +} + +"#" { + return MARK; +} + +[>]{1,2} { + int match_length = 0; + int token = YYPSLR_PSEUDO_SCAN(p, yytext, &match_length); + + if (token == YYEMPTY) { + token = (yyleng == 2) ? RSHIFT : RANGLE; + match_length = yyleng; + } + + if (0 < match_length && match_length < yyleng) { + yyless(match_length); + } + + return token; +} + +[[:space:]]+ {} + +<<EOF>> { + return YYEOF; +} + +. 
{ + fprintf(stderr, "Illegal character '%s'\n", yytext); + return YYUNDEF; +} + +%% diff --git a/spec/fixtures/integration/pslr_shift_chain.y b/spec/fixtures/integration/pslr_shift_chain.y new file mode 100644 index 00000000..d8f934fc --- /dev/null +++ b/spec/fixtures/integration/pslr_shift_chain.y @@ -0,0 +1,88 @@ +%{ +#include <stdio.h> + +#define YY_DECL int yylex(YYSTYPE *lval, struct parse_params *p) + +#include "pslr_shift_chain.h" +#include "pslr_shift_chain-lexer.h" + +extern int yylex(YYSTYPE *lval, struct parse_params *p); +static int yyerror(YYLTYPE *loc, struct parse_params *p, const char *str); +%} + +%code requires { + struct parse_params { + int current_state; + }; +} + +%define api.pure +%define lr.type pslr +%define api.pslr.state-member current_state + +%lex-param {struct parse_params *p} +%parse-param {struct parse_params *p} + +%token-pattern LT /</ +%token-pattern START /@/ +%token-pattern MARK /#/ +%token-pattern RSHIFT />>/ +%token-pattern RANGLE />/ +%token-pattern ID /[a-z]+/ + +%lex-prec RANGLE -s RSHIFT + +%% + +program + : template_expr { printf("template\n"); } + | shift_expr { printf("shift\n"); } + ; + +template_expr + : LT shared RANGLE + ; + +shift_expr + : START shared RSHIFT ID + ; + +shared + : n1 + ; + +n1 + : n2 + ; + +n2 + : MARK + ; + +%% + +static int
yyerror(YYLTYPE *loc, struct parse_params *p, const char *str) +{ + (void)loc; + (void)p; + fprintf(stderr, "parse error: %s\n", str); + return 0; +} + +int +main(int argc, char *argv[]) +{ + struct parse_params params = { 0 }; + + if (argc == 2) { + yy_scan_string(argv[1]); + } + + if (yyparse(&params)) { + fprintf(stderr, "syntax error\n"); + return 1; + } + + return 0; +} diff --git a/spec/lrama/command_spec.rb b/spec/lrama/command_spec.rb index 58069e4a..ce30eccf 100644 --- a/spec/lrama/command_spec.rb +++ b/spec/lrama/command_spec.rb @@ -81,5 +81,105 @@ File.delete("report.output") end end + + context "when a PSLR grammar needs pure-reduce lookahead to choose tokens" do + let(:outfile) { File.join(Dir.tmpdir, "pslr-pure-reduce.c") } + + before do + 
File.delete(outfile) if File.exist?(outfile) + end + + after do + File.delete(outfile) if File.exist?(outfile) + end + + it "emits parser output successfully" do + command = Lrama::Command.new(["-o", outfile, fixture_path("command/pslr_pure_reduce.y")]) + + expect(command.run).to be_nil + expect(File).to exist(outfile) + end + end + + context "when validation aborts" do + let(:outfile) { File.join(Dir.tmpdir, "validate-abort.c") } + + before do + File.delete(outfile) if File.exist?(outfile) + end + + after do + File.delete(outfile) if File.exist?(outfile) + end + + it "fails before writing parser output" do + allow_any_instance_of(Lrama::States).to receive(:validate!).and_raise(SystemExit) + + command = Lrama::Command.new(["-o", outfile, fixture_path("command/basic.y")]) + + expect { command.run }.to raise_error(SystemExit) + expect(File).not_to exist(outfile) + end + end + + context "when a PSLR grammar exceeds the configured state limit" do + let(:outfile) { File.join(Dir.tmpdir, "pslr-growth-limit.c") } + + before do + File.delete(outfile) if File.exist?(outfile) + end + + after do + File.delete(outfile) if File.exist?(outfile) + end + + it "fails before writing parser output" do + command = Lrama::Command.new([ + "-Dpslr.max-states=5", + "-o", outfile, + fixture_path("command/pslr_growth_limit.y") + ]) + + expect do + begin + command.run + rescue SystemExit + nil + end + end.to output(/error: PSLR state growth exceeded pslr.max-states=5/).to_stderr_from_any_process + + expect(File).not_to exist(outfile) + end + end + + context "when PSLR report output is requested" do + let(:outfile) { File.join(Dir.tmpdir, "pslr-report.c") } + let(:report_file) { File.join(Dir.tmpdir, "pslr-report.output") } + + before do + File.delete(outfile) if File.exist?(outfile) + File.delete(report_file) if File.exist?(report_file) + end + + after do + File.delete(outfile) if File.exist?(outfile) + File.delete(report_file) if File.exist?(report_file) + end + + it "writes PSLR metrics 
into the report file" do + command = Lrama::Command.new([ + "--report=pslr", + "--report-file=#{report_file}", + "-o", outfile, + fixture_path("command/pslr_growth_limit.y") + ]) + + expect(command.run).to be_nil + report = File.read(report_file) + expect(report).to include("PSLR Summary") + expect(report).to include("Base states:") + expect(report).to include("Total states:") + end + end end end diff --git a/spec/lrama/grammar/lex_prec_spec.rb b/spec/lrama/grammar/lex_prec_spec.rb new file mode 100644 index 00000000..59e1ea6b --- /dev/null +++ b/spec/lrama/grammar/lex_prec_spec.rb @@ -0,0 +1,51 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::Grammar::LexPrec do + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + it "stores lex-prec rules" do + left = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + right = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + + lex_prec.add_rule( + left_token: left, + operator: Lrama::Grammar::LexPrec::SHORTER, + right_token: right, + lineno: 1 + ) + + expect(lex_prec.rules.size).to eq(1) + expect(lex_prec.shorter_priority?("RANGLE", "RSHIFT")).to be true + expect(lex_prec.shorter_priority?("RSHIFT", "RANGLE")).to be false + end + + it "handles higher priority rules" do + left = Lrama::Lexer::Token::Ident.new(s_value: "IF") + right = Lrama::Lexer::Token::Ident.new(s_value: "ID") + + lex_prec.add_rule( + left_token: left, + operator: Lrama::Grammar::LexPrec::HIGHER, + right_token: right, + lineno: 1 + ) + + expect(lex_prec.higher_priority?("IF", "ID")).to be true + expect(lex_prec.higher_priority?("ID", "IF")).to be false + end + + it "handles same priority (lex-tie) rules" do + left = Lrama::Lexer::Token::Ident.new(s_value: "TOKEN_A") + right = Lrama::Lexer::Token::Ident.new(s_value: "TOKEN_B") + + lex_prec.add_rule( + left_token: left, + operator: Lrama::Grammar::LexPrec::SAME_PRIORITY, + right_token: right, + lineno: 1 + ) + + expect(lex_prec.same_priority?("TOKEN_A", "TOKEN_B")).to be true + 
expect(lex_prec.same_priority?("TOKEN_B", "TOKEN_A")).to be true + end +end diff --git a/spec/lrama/grammar/token_pattern_spec.rb b/spec/lrama/grammar/token_pattern_spec.rb new file mode 100644 index 00000000..5d4c95da --- /dev/null +++ b/spec/lrama/grammar/token_pattern_spec.rb @@ -0,0 +1,22 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::Grammar::TokenPattern do + it "stores token pattern information" do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + pattern = Lrama::Lexer::Token::Regex.new(s_value: "/>>>/") + + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: pattern, + alias_name: "right shift", + tag: nil, + lineno: 1, + definition_order: 0 + ) + + expect(token_pattern.name).to eq("RSHIFT") + expect(token_pattern.regex_pattern).to eq(">>>") + expect(token_pattern.alias_name).to eq("right shift") + expect(token_pattern.definition_order).to eq(0) + end +end diff --git a/spec/lrama/grammar_spec.rb b/spec/lrama/grammar_spec.rb index 3be8eab4..d551467e 100644 --- a/spec/lrama/grammar_spec.rb +++ b/spec/lrama/grammar_spec.rb @@ -242,5 +242,47 @@ end end end + + context 'when PSLR state member is not a valid C identifier' do + before do + grammar.define = { + 'lr.type' => 'pslr', + 'api.pslr.state-member' => 'current-state' + } + end + + it 'raises an error with the invalid member name' do + expect { grammar.validate! } + .to raise_error(RuntimeError, '%define api.pslr.state-member must be a valid C identifier, got "current-state".') + end + end + + context 'when PSLR max states is not an integer' do + before do + grammar.define = { + 'lr.type' => 'pslr', + 'pslr.max-states' => 'many' + } + end + + it 'raises an error with the invalid value' do + expect { grammar.validate! 
} + .to raise_error(RuntimeError, '%define pslr.max-states must be an integer, got "many".') + end + end + + context 'when PSLR max state ratio is smaller than one' do + before do + grammar.define = { + 'lr.type' => 'pslr', + 'pslr.max-state-ratio' => '0.5' + } + end + + it 'raises an error with the invalid ratio' do + expect { grammar.validate! } + .to raise_error(RuntimeError, '%define pslr.max-state-ratio must be greater than or equal to 1.0, got "0.5".') + end + end end end diff --git a/spec/lrama/integration_spec.rb b/spec/lrama/integration_spec.rb index c0bba17f..99dd4439 100644 --- a/spec/lrama/integration_spec.rb +++ b/spec/lrama/integration_spec.rb @@ -130,6 +130,33 @@ def generate_object(grammar_file_path, c_path, obj_path, command_args: []) end end + describe "PSLR parser and lexer integration" do + it "selects the longer token in shift contexts" do + test_parser("pslr_context", "foo>>bar", "shift\n") + end + + it "can prefer the shorter match in template contexts" do + test_parser("pslr_context", "foo>", "template\n") + end + + it "splits keyword and identifier contexts with the same lexeme" do + test_parser("pslr_keyword_context", "p x if", "kw\n") + test_parser("pslr_keyword_context", "q x if", "id\n") + end + + it "keeps chained shift and template contexts distinct" do + test_parser("pslr_shift_chain", "< # >", "template\n") + test_parser("pslr_shift_chain", "@ # >> foo", "shift\n") + end + + it "handles mixed keyword, identifier, template, and shift contexts in one grammar" do + test_parser("pslr_mixed_context", "p # if", "kw\n") + test_parser("pslr_mixed_context", "q # if", "id\n") + test_parser("pslr_mixed_context", "< # >", "template\n") + test_parser("pslr_mixed_context", "@ # >> foo", "shift\n") + end + end + describe "user defined parameterized rules" do it "prints messages corresponding to rules" do expected = <<~STR @@ -307,4 +334,232 @@ def generate_object(grammar_file_path, c_path, obj_path, command_args: []) end end end + + describe "PSLR 
context-dependent lexing" do + describe "Scanner FSA with overlapping patterns" do + let(:rangle) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:rshift) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 1 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + + it "recognizes both RANGLE and RSHIFT as possible matches for '>>'" do + results = scanner_fsa.scan(">>") + + token_names = results.map { |r| r[:token].name } + expect(token_names).to include("RANGLE") + expect(token_names).to include("RSHIFT") + end + + it "RANGLE matches at position 1, RSHIFT matches at position 2" do + results = scanner_fsa.scan(">>") + + rangle_match = results.find { |r| r[:token].name == "RANGLE" } + rshift_match = results.find { |r| r[:token].name == "RSHIFT" } + + expect(rangle_match[:position]).to eq(1) + expect(rshift_match[:position]).to eq(2) + end + end + + describe "Length precedence resolution" do + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + before do + left = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + right = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + lex_prec.add_rule( + left_token: left, + operator: Lrama::Grammar::LexPrec::SHORTER, + right_token: right, + lineno: 1 + ) + end + + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + + it "indicates RANGLE (shorter) should be preferred over RSHIFT (longer)" do + expect(length_prec.prefer_shorter?("RANGLE", "RSHIFT")).to be true + end + + it "returns :left precedence for RANGLE vs RSHIFT" do + expect(length_prec.precedence("RANGLE", "RSHIFT")).to eq(:left) + end + end + + describe "Keyword vs identifier precedence" do + 
let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + before do + left = Lrama::Lexer::Token::Ident.new(s_value: "IF") + right = Lrama::Lexer::Token::Ident.new(s_value: "ID") + lex_prec.add_rule( + left_token: left, + operator: Lrama::Grammar::LexPrec::HIGHER, + right_token: right, + lineno: 1 + ) + end + + it "indicates IF has higher priority than ID" do + expect(lex_prec.higher_priority?("IF", "ID")).to be true + end + + it "indicates ID does not have higher priority than IF" do + expect(lex_prec.higher_priority?("ID", "IF")).to be false + end + end + + describe "Full PSLR grammar compilation" do + let(:grammar_text) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern RSHIFT />>/ "right shift" + %token-pattern RANGLE />/ "right angle" + %token-pattern LANGLE />") + token_names = results.map { |r| r[:token].name } + + expect(token_names).to include("RANGLE") + expect(token_names).to include("RSHIFT") + end + + describe "context-dependent token selection" do + it "scanner_accepts table is built" do + expect(states.scanner_accepts_table).not_to be_nil + end + + it "different parser states may accept different tokens for same FSA state" do + scanner_accepts = states.scanner_accepts_table + scanner_fsa = states.scanner_fsa + + results = scanner_fsa.scan(">>") + rshift_result = results.find { |r| r[:token].name == "RSHIFT" } + rangle_result = results.find { |r| r[:token].name == "RANGLE" } + + expect(rshift_result).not_to be_nil + expect(rangle_result).not_to be_nil + expect(scanner_accepts.table).to be_a(Hash) + end + end + + describe "generated C code output" do + let(:out) { StringIO.new } + let(:context) { Lrama::Context.new(states) } + let(:output) do + Lrama::Output.new( + out: out, + output_file_path: "pslr_test.c", + template_name: "bison/yacc.c", + grammar_file_path: "pslr_test.y", + context: context, + grammar: grammar + ) + end + + before do + output.render + out.rewind + end + + let(:rendered) { out.read } + + it "includes yy_scanner_transition table" do + 
expect(rendered).to include("yy_scanner_transition") + expect(rendered).to include("YY_SCANNER_NUM_STATES") + end + + it "includes yy_state_to_accepting mapping" do + expect(rendered).to include("yy_state_to_accepting") + expect(rendered).to include("YY_ACCEPTING_NONE") + end + + it "includes yy_length_precedences table" do + expect(rendered).to include("yy_length_precedences") + expect(rendered).to include("YY_LENGTH_PREC_LEFT") + end + + it "includes yy_pseudo_scan function" do + expect(rendered).to include("yy_pseudo_scan") + expect(rendered).to include("parser_state") + expect(rendered).to include("match_length") + end + + it "pseudo_scan function uses length precedences for token selection" do + expect(rendered).to include("yy_length_precedences[pbest][pattern_index]") + end + end + end + end end diff --git a/spec/lrama/length_precedences_spec.rb b/spec/lrama/length_precedences_spec.rb new file mode 100644 index 00000000..283a7d65 --- /dev/null +++ b/spec/lrama/length_precedences_spec.rb @@ -0,0 +1,76 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::LengthPrecedences do + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + describe "#precedence" do + it "returns :undefined when no rule exists" do + length_prec = Lrama::LengthPrecedences.new(lex_prec) + expect(length_prec.precedence("TOKEN_A", "TOKEN_B")).to eq(:undefined) + end + + it "returns :left when shorter token should be preferred" do + left_token = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + right_token = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + lex_prec.add_rule( + left_token: left_token, + operator: Lrama::Grammar::LexPrec::SHORTER, + right_token: right_token, + lineno: 1 + ) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.precedence("RANGLE", "RSHIFT")).to eq(:left) + end + + it "returns :right for the inverse relationship" do + left_token = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + right_token = 
Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + lex_prec.add_rule( + left_token: left_token, + operator: Lrama::Grammar::LexPrec::SHORTER, + right_token: right_token, + lineno: 1 + ) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.precedence("RSHIFT", "RANGLE")).to eq(:right) + end + end + + describe "#prefer_shorter?" do + it "returns true when shorter token should be preferred" do + left_token = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + right_token = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + lex_prec.add_rule( + left_token: left_token, + operator: Lrama::Grammar::LexPrec::SHORTER, + right_token: right_token, + lineno: 1 + ) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.prefer_shorter?("RANGLE", "RSHIFT")).to be true + end + + it "returns false when no preference exists" do + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.prefer_shorter?("TOKEN_A", "TOKEN_B")).to be false + end + + it "returns false for inverse relationship" do + left_token = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + right_token = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + lex_prec.add_rule( + left_token: left_token, + operator: Lrama::Grammar::LexPrec::SHORTER, + right_token: right_token, + lineno: 1 + ) + length_prec = Lrama::LengthPrecedences.new(lex_prec) + + expect(length_prec.prefer_shorter?("RSHIFT", "RANGLE")).to be false + end + end +end diff --git a/spec/lrama/lexer/token/regex_spec.rb b/spec/lrama/lexer/token/regex_spec.rb new file mode 100644 index 00000000..bc4fb38e --- /dev/null +++ b/spec/lrama/lexer/token/regex_spec.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::Lexer::Token::Regex do + describe "#pattern" do + it "returns the pattern without surrounding slashes" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>>/") + expect(regex.pattern).to eq(">>>") + end + + it "handles character class patterns" do + regex = 
Lrama::Lexer::Token::Regex.new(s_value: "/[a-zA-Z_][a-zA-Z0-9_]*/") + expect(regex.pattern).to eq("[a-zA-Z_][a-zA-Z0-9_]*") + end + + it "handles escape sequences" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/\\+/") + expect(regex.pattern).to eq("\\+") + end + + it "handles empty pattern" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "//") + expect(regex.pattern).to eq("") + end + + it "handles single character pattern" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + expect(regex.pattern).to eq(">") + end + end + + describe "#s_value" do + it "returns the original value including slashes" do + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>>/") + expect(regex.s_value).to eq("/>>>/") + end + end +end diff --git a/spec/lrama/option_parser_spec.rb b/spec/lrama/option_parser_spec.rb index 7675c4da..0efaa900 100644 --- a/spec/lrama/option_parser_spec.rb +++ b/spec/lrama/option_parser_spec.rb @@ -80,6 +80,7 @@ lookaheads explicitly associate lookahead tokens to items solved describe shift/reduce conflicts solving counterexamples, cex generate conflict counterexamples + pslr report PSLR split and scanner metrics rules list unused rules terms list unused terminals verbose report detailed internal state and analysis results @@ -128,12 +129,19 @@ end end + context "when pslr is passed" do + it "returns option hash pslr flag enabled" do + opts = option_parser.send(:validate_report, ["pslr"]) + expect(opts).to eq({grammar: true, pslr: true}) + end + end + context "when all is passed" do it "returns option hash all flags enabled" do opts = option_parser.send(:validate_report, ["all"]) expect(opts).to eq({ grammar: true, states: true, itemsets: true, - lookaheads: true, solved: true, counterexamples: true, + lookaheads: true, solved: true, counterexamples: true, pslr: true, rules: true, terms: true, verbose: true }) end diff --git a/spec/lrama/output_spec.rb b/spec/lrama/output_spec.rb index 5fa1d04f..aa905484 100644 --- 
a/spec/lrama/output_spec.rb +++ b/spec/lrama/output_spec.rb @@ -203,4 +203,228 @@ end end end + + describe "PSLR methods" do + let(:token_pattern) do + id = Lrama::Lexer::Token::Ident.new(s_value: "ID") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[a-z]+/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([token_pattern]) } + + let(:mock_states) do + instance_double( + Lrama::States, + scanner_fsa: scanner_fsa, + scanner_accepts_table: nil, + length_precedences: nil, + token_patterns: [token_pattern], + states: [], + find_symbol_by_s_value!: instance_double(Lrama::Grammar::Symbol, token_id: 301) + ) + end + + let(:mock_context) do + instance_double(Lrama::Context, states: mock_states) + end + + let(:mock_grammar) do + instance_double( + Lrama::Grammar, + eof_symbol: nil, + error_symbol: nil, + undef_symbol: nil, + accept_symbol: nil, + locations: false, + parse_param: "struct parse_params *p", + lex_param: "struct parse_params *p", + pslr_defined?: true, + pslr_state_member: "current_state" + ) + end + + let(:pslr_output) do + out = StringIO.new + Lrama::Output.new( + out: out, + output_file_path: "test.c", + template_name: "bison/yacc.c", + grammar_file_path: "test.y", + context: mock_context, + grammar: mock_grammar + ) + end + + describe "#pslr_enabled?" do + it "returns true when grammar requested PSLR output" do + expect(pslr_output.pslr_enabled?).to be true + end + + it "returns false when grammar did not request PSLR output" do + allow(mock_grammar).to receive(:pslr_defined?).and_return(false) + expect(pslr_output.pslr_enabled?).to be false + end + end + + describe "#pslr_scanner_enabled?" 
do + it "returns true when scanner FSA is built with states" do + expect(pslr_output.pslr_scanner_enabled?).to be true + end + + it "returns false when scanner FSA is nil" do + allow(mock_states).to receive(:scanner_fsa).and_return(nil) + expect(pslr_output.pslr_scanner_enabled?).to be false + end + + it "returns false when scanner FSA has no states" do + empty_fsa = Lrama::ScannerFSA.new([]) + allow(mock_states).to receive(:scanner_fsa).and_return(empty_fsa) + expect(pslr_output.pslr_scanner_enabled?).to be false + end + end + + describe "#scanner_transition_table" do + it "generates C code for scanner transitions" do + result = pslr_output.scanner_transition_table + expect(result).to include("YY_SCANNER_NUM_STATES") + expect(result).to include("yy_scanner_transition") + end + end + + describe "#pseudo_scan_function" do + it "generates the pseudo_scan C function" do + result = pslr_output.pseudo_scan_function + expect(result).to include("yy_pseudo_scan") + expect(result).to include("parser_state") + expect(result).to include("match_length") + expect(result).to include("yy_token_pattern_to_token_id") + end + end + + describe "#pslr_tables_and_functions" do + it "generates all PSLR C code" do + result = pslr_output.pslr_tables_and_functions + expect(result).to include("PSLR(1) Scanner Tables and Functions") + expect(result).to include("YY_SCANNER_NUM_STATES") + expect(result).to include("yy_scanner_transition") + expect(result).to include("yy_pseudo_scan") + expect(result).to include("yy_token_pattern_to_token_id") + end + end + + describe "#state_to_accepting_table" do + it "generates state to accepting mapping" do + result = pslr_output.state_to_accepting_table + expect(result).to include("yy_state_to_accepting") + expect(result).to include("YY_ACCEPTING_NONE") + end + end + + describe "#pslr_function_declarations" do + it "declares the PSLR helper entry points" do + result = pslr_output.pslr_function_declarations + expect(result).to include("int 
yy_state_accepts_token") + expect(result).to include("int yy_pseudo_scan") + expect(result).to include("YYPSLR_PSEUDO_SCAN_STATE") + expect(result).to include("YYPSLR_PSEUDO_SCAN") + expect(result).to include("YYSETSTATE_CONTEXT(CurrentState)") + end + end + + describe "#length_precedences_table_code" do + let(:mock_length_prec) { Lrama::LengthPrecedences.new(Lrama::Grammar::LexPrec.new) } + + before do + allow(mock_states).to receive(:length_precedences).and_return(mock_length_prec) + end + + it "generates length precedences table" do + result = pslr_output.length_precedences_table_code + expect(result).to include("length_precedences") + expect(result).to include("YY_LENGTH_PREC_UNDEFINED") + end + end + + describe "#accepting_tokens_table" do + it "generates accepting tokens information" do + result = pslr_output.accepting_tokens_table + expect(result).to include("Accepting state token IDs") + end + end + end + + describe "PSLR integration in render" do + let(:pslr_grammar_text) do + <<~GRAMMAR + %code requires { + struct parse_params { + int current_state; + }; + } + %define lr.type pslr + %define api.pslr.state-member current_state + %parse-param {struct parse_params *p} + %lex-param {struct parse_params *p} + %token-pattern RSHIFT />>/ "right shift" + %token-pattern RANGLE />/ "right angle" + %lex-prec RANGLE -s RSHIFT + %% + program: RSHIFT | RANGLE + GRAMMAR + end + + let(:pslr_grammar) do + grammar = Lrama::Parser.new(pslr_grammar_text, "pslr_test.y").parse + grammar.prepare + grammar.validate! 
+ grammar + end + + let(:pslr_states) do + s = Lrama::States.new(pslr_grammar, Lrama::Tracer.new(Lrama::Logger.new)) + s.compute + s.compute_pslr + s + end + + let(:pslr_context) { Lrama::Context.new(pslr_states) } + let(:pslr_out) { StringIO.new } + + let(:pslr_full_output) do + Lrama::Output.new( + out: pslr_out, + output_file_path: "pslr_test.c", + template_name: "bison/yacc.c", + grammar_file_path: "pslr_test.y", + header_out: header_out, + header_file_path: "pslr_test.h", + context: pslr_context, + grammar: pslr_grammar + ) + end + + it "includes PSLR tables in rendered output" do + pslr_full_output.render + pslr_out.rewind + header_out.rewind + rendered = pslr_out.read + rendered_header = header_out.read + + expect(rendered).to include("PSLR(1) Scanner Tables and Functions") + expect(rendered).to include("YY_SCANNER_NUM_STATES") + expect(rendered).to include("yy_scanner_transition") + expect(rendered).to include("yy_pseudo_scan") + expect(rendered).to include("yy_token_pattern_to_token_id") + expect(rendered_header).to include("int yy_state_accepts_token") + expect(rendered_header).to include("int yy_pseudo_scan") + expect(rendered_header).to include("YYSETSTATE_CONTEXT(CurrentState)") + expect(rendered_header).to include("YYPSLR_PSEUDO_SCAN(Context, Input, MatchLength)") + end + end end diff --git a/spec/lrama/parser_spec.rb b/spec/lrama/parser_spec.rb index c2115ee2..7ca86b54 100644 --- a/spec/lrama/parser_spec.rb +++ b/spec/lrama/parser_spec.rb @@ -4598,4 +4598,130 @@ class : keyword_class tSTRING keyword_end end end end + + describe "PSLR directives" do + describe "%token-pattern" do + it "parses a single token pattern" do + y = <<~GRAMMAR + %token-pattern RSHIFT />>/ "right shift" + %% + program: RSHIFT + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! 
+ + expect(grammar.token_patterns.size).to eq(1) + token_pattern = grammar.token_patterns.first + expect(token_pattern.name).to eq("RSHIFT") + expect(token_pattern.regex_pattern).to eq(">>") + expect(token_pattern.alias_name).to eq("\"right shift\"") + end + + it "parses multiple token patterns" do + y = <<~GRAMMAR + %token-pattern RSHIFT />>/ "right shift" + %token-pattern RANGLE />/ "right angle" + %token-pattern LANGLE / RSHIFT />>/ "right shift" + %% + program: RSHIFT + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.token_patterns.size).to eq(1) + token_pattern = grammar.token_patterns.first + expect(token_pattern.tag.s_value).to eq("") + end + end + + describe "%lex-prec" do + it "parses shorter priority rule" do + y = <<~GRAMMAR + %token RANGLE RSHIFT + %lex-prec RANGLE -s RSHIFT + %% + program: RANGLE | RSHIFT + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.lex_prec.rules.size).to eq(1) + expect(grammar.lex_prec.shorter_priority?("RANGLE", "RSHIFT")).to be true + end + + it "parses higher priority rule" do + y = <<~GRAMMAR + %token IF ID + %lex-prec IF - ID + %% + program: IF | ID + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.lex_prec.rules.size).to eq(1) + expect(grammar.lex_prec.higher_priority?("IF", "ID")).to be true + end + + it "parses chained lex-prec rules" do + y = <<~GRAMMAR + %token IF ELSE WHILE ID + %lex-prec IF - ELSE - WHILE - ID + %% + program: IF | ELSE | WHILE | ID + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! 
+ + expect(grammar.lex_prec.rules.size).to eq(3) + expect(grammar.lex_prec.higher_priority?("IF", "ELSE")).to be true + expect(grammar.lex_prec.higher_priority?("ELSE", "WHILE")).to be true + expect(grammar.lex_prec.higher_priority?("WHILE", "ID")).to be true + end + end + + describe "%define lr.type pslr" do + it "recognizes pslr lr.type" do + y = <<~GRAMMAR + %define lr.type pslr + %token ID + %% + program: ID + GRAMMAR + + grammar = Lrama::Parser.new(y, "pslr_test.y").parse + grammar.prepare + grammar.validate! + + expect(grammar.pslr_defined?).to be true + end + end + end end diff --git a/spec/lrama/pslr_family_regressions_spec.rb b/spec/lrama/pslr_family_regressions_spec.rb new file mode 100644 index 00000000..59f3d3c1 --- /dev/null +++ b/spec/lrama/pslr_family_regressions_spec.rb @@ -0,0 +1,294 @@ +# frozen_string_literal: true + +RSpec.describe "PSLR family regressions" do + include PslrFamilyHelper + + describe "pure-reduce profile" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "states/pslr_pure_reduce.y") + %define lr.type pslr + %token-pattern RSHIFT />>/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-prec RANGLE -s RSHIFT + + %% + + program + : templ + | rshift_expr + ; + + templ + : a RANGLE + ; + + rshift_expr + : a RSHIFT ID + ; + + a + : ID + ; + GRAMMAR + end + + it "keeps pure reduce states scanner-compatible without forcing a split" do + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + reduce_state = pslr_states.states.find do |state| + state.reduces.any? 
{ |reduce| reduce.rule.display_name == "a -> ID" } + end + + expect(pslr_states.states_count).to eq(ielr_states.states_count) + expect(pslr_states.pslr_inadequacies).to be_empty + expect(acceptable_tokens(pslr_states, reduce_state)).to contain_exactly("RANGLE", "RSHIFT") + end + end + + describe "chained keyword split" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "states/pslr_keyword_context.y") + %define lr.type pslr + %token-pattern P /p/ + %token-pattern Q /q/ + %token-pattern X /x/ + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec IF - ID + + %% + + program + : kw_context + | id_context + ; + + kw_context + : P shared IF + ; + + id_context + : Q shared ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : X + ; + GRAMMAR + end + + it "splits every chained reduce state by scanner profile" do + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + reduce_states = pslr_states.states + .select { |state| state.reduces.any? } + .group_by { |state| state.reduces.first.rule.display_name } + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + + ["shared -> n1", "n1 -> n2", "n2 -> X"].each do |rule_name| + states_for_rule = reduce_states.fetch(rule_name) + token_sets = states_for_rule.map { |state| acceptable_tokens(pslr_states, state) } + + expect(states_for_rule.size).to eq(2) + expect(states_for_rule.count(&:split_state?)).to eq(1) + expect(token_sets.any? { |set| set.include?("IF") && !set.include?("ID") }).to be(true) + expect(token_sets.any? 
{ |set| set.include?("ID") && !set.include?("IF") }).to be(true) + end + end + end + + describe "chained shift/angle split" do + let(:grammar) do + build_grammar(<<~GRAMMAR, "states/pslr_shift_chain.y") + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-prec RANGLE -s RSHIFT + + %% + + program + : template_expr + | shift_expr + ; + + template_expr + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : MARK + ; + GRAMMAR + end + + it "splits every chained reduce state by shift/angle scanner profile" do + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + reduce_states = pslr_states.states + .select { |state| state.reduces.any? } + .group_by { |state| state.reduces.first.rule.display_name } + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + + ["shared -> n1", "n1 -> n2", "n2 -> MARK"].each do |rule_name| + states_for_rule = reduce_states.fetch(rule_name) + token_sets = states_for_rule.map { |state| acceptable_tokens(pslr_states, state) } + + expect(states_for_rule.size).to eq(2) + expect(states_for_rule.count(&:split_state?)).to eq(1) + expect(token_sets.any? { |set| set.include?("RANGLE") && !set.include?("RSHIFT") }).to be(true) + expect(token_sets.any? 
{ |set| set.include?("RSHIFT") && !set.include?("RANGLE") }).to be(true) + end + end + end + + describe "mixed families" do + { + "empty shared wrapper" => { + path: "states/pslr_mixed_empty.y", + grammar: <<~GRAMMAR, + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec IF - ID + %lex-prec RANGLE -s RSHIFT + + %% + + program + : kw + | ident + | templ + | shift_expr + ; + + kw + : P shared IF + ; + + ident + : Q shared ID + ; + + templ + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : opt n1 + ; + + opt + : + ; + + n1 + : MARK + ; + GRAMMAR + }, + "chain2 shared wrapper" => { + path: "states/pslr_mixed_chain2.y", + grammar: <<~GRAMMAR, + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec IF - ID + %lex-prec RANGLE -s RSHIFT + + %% + + program + : kw + | ident + | templ + | shift_expr + ; + + kw + : P shared IF + ; + + ident + : Q shared ID + ; + + templ + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : MARK + ; + GRAMMAR + } + }.each do |label, attrs| + it "keeps #{label} scanner-compatible" do + grammar = build_grammar(attrs[:grammar], attrs[:path]) + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + end + end + end +end diff --git a/spec/lrama/pslr_generated_families_spec.rb b/spec/lrama/pslr_generated_families_spec.rb new file mode 100644 index 00000000..795d25d6 --- /dev/null +++ b/spec/lrama/pslr_generated_families_spec.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +RSpec.describe "PSLR generated family coverage" do + include PslrFamilyHelper + + families = [ + { + label: "keyword/id", + builder: :keyword_context_source, + path_prefix: "generated/pslr_keyword_depth", + split_expected: true + }, + { + label: "shift/angle", + builder: 
:shift_angle_source, + path_prefix: "generated/pslr_shift_depth", + split_expected: true + }, + { + label: "mixed", + builder: :mixed_context_source, + path_prefix: "generated/pslr_mixed_depth", + split_expected: true + } + ].freeze + + families.each do |family| + (0..3).each do |depth| + it "#{family[:label]} depth=#{depth} keeps PSLR inadequacies resolved" do + grammar = build_grammar( + public_send(family[:builder], depth: depth), + "#{family[:path_prefix]}_#{depth}.y" + ) + ielr_states, pslr_states = compute_ielr_and_pslr(grammar) + + aggregate_failures do + expect(pslr_states.pslr_inadequacies).to be_empty + expect(pslr_states.states_count).to be >= ielr_states.states_count + expect(pslr_states.pslr_metrics[:growth_count]).to eq(pslr_states.states_count - pslr_states.pslr_metrics[:base_states_count]) + + next unless family[:split_expected] + next unless 1 <= depth + + expect(pslr_states.states_count).to be > ielr_states.states_count + end + end + end + end +end diff --git a/spec/lrama/scanner_fsa_spec.rb b/spec/lrama/scanner_fsa_spec.rb new file mode 100644 index 00000000..8ba20de8 --- /dev/null +++ b/spec/lrama/scanner_fsa_spec.rb @@ -0,0 +1,193 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::ScannerFSA do + describe "initialization" do + it "creates an empty FSA for no patterns" do + fsa = Lrama::ScannerFSA.new([]) + expect(fsa.states).to be_empty + end + + it "creates FSA for single literal pattern" do + id = Lrama::Lexer::Token::Ident.new(s_value: "PLUS") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/\\+/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + expect(fsa.states).not_to be_empty + expect(fsa.initial_state).not_to be_nil + end + end + + describe "#scan" do + it "matches a single character pattern" do + id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + regex = Lrama::Lexer::Token::Regex.new(s_value: 
"/>/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + results = fsa.scan(">") + expect(results.size).to eq(1) + expect(results[0][:token].name).to eq("RANGLE") + expect(results[0][:position]).to eq(1) + end + + it "matches a multi-character pattern" do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + results = fsa.scan(">>") + expect(results.size).to eq(1) + expect(results[0][:token].name).to eq("RSHIFT") + expect(results[0][:position]).to eq(2) + end + + it "returns multiple matches for overlapping patterns" do + rangle_id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + rangle_regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + rangle = Lrama::Grammar::TokenPattern.new( + id: rangle_id, + pattern: rangle_regex, + lineno: 1, + definition_order: 0 + ) + + rshift_id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + rshift_regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + rshift = Lrama::Grammar::TokenPattern.new( + id: rshift_id, + pattern: rshift_regex, + lineno: 1, + definition_order: 1 + ) + + fsa = Lrama::ScannerFSA.new([rangle, rshift]) + + results = fsa.scan(">>") + + # Should match both RANGLE at position 1 and RSHIFT at position 2 + expect(results.size).to eq(2) + positions = results.map { |r| [r[:token].name, r[:position]] } + expect(positions).to include(["RANGLE", 1]) + expect(positions).to include(["RSHIFT", 2]) + end + + it "matches character class patterns" do + id = Lrama::Lexer::Token::Ident.new(s_value: "ID") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[a-zA-Z_][a-zA-Z0-9_]*/") + id_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + 
definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([id_pattern]) + + results = fsa.scan("hello_world123") + expect(results).not_to be_empty + # Should have matches at each position as the identifier grows + end + + it "matches digit patterns" do + id = Lrama::Lexer::Token::Ident.new(s_value: "INT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[0-9]+/") + int_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([int_pattern]) + + results = fsa.scan("12345") + expect(results).not_to be_empty + end + end + + describe "#acc_ss" do + it "returns empty array for non-accepting state" do + id = Lrama::Lexer::Token::Ident.new(s_value: "AB") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/ab/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + # Initial state shouldn't be accepting for non-empty pattern + tokens = fsa.acc_ss(0) + expect(tokens).to be_empty + end + + it "returns accepting tokens for accepting state" do + id = Lrama::Lexer::Token::Ident.new(s_value: "A") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/a/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + # Scan to reach accepting state + results = fsa.scan("a") + expect(results).not_to be_empty + + accepting_state = results[0][:state] + tokens = fsa.acc_ss(accepting_state.id) + expect(tokens.map(&:name)).to include("A") + end + end + + describe "#state_to_accepting_state" do + it "returns nil for non-accepting state" do + id = Lrama::Lexer::Token::Ident.new(s_value: "AB") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/ab/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = 
Lrama::ScannerFSA.new([token_pattern]) + + expect(fsa.state_to_accepting_state(0)).to be_nil + end + + it "returns the state itself for accepting state" do + id = Lrama::Lexer::Token::Ident.new(s_value: "A") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/a/") + token_pattern = Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + fsa = Lrama::ScannerFSA.new([token_pattern]) + + results = fsa.scan("a") + accepting_state = results[0][:state] + + expect(fsa.state_to_accepting_state(accepting_state.id)).to eq(accepting_state) + end + end +end diff --git a/spec/lrama/state/pslr_inadequacy_spec.rb b/spec/lrama/state/pslr_inadequacy_spec.rb new file mode 100644 index 00000000..cef7a5d6 --- /dev/null +++ b/spec/lrama/state/pslr_inadequacy_spec.rb @@ -0,0 +1,234 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::State::PslrInadequacy do + let(:mock_state) do + instance_double(Lrama::State, id: 0) + end + + let(:mock_conflicting_states) do + [ + instance_double(Lrama::State, id: 1), + instance_double(Lrama::State, id: 2) + ] + end + + describe "#initialize" do + it "creates an LR-relative inadequacy" do + inadequacy = Lrama::State::PslrInadequacy.new( + type: Lrama::State::PslrInadequacy::LR_RELATIVE, + state: mock_state, + conflicting_states: mock_conflicting_states, + details: { reason: "test" } + ) + + expect(inadequacy.type).to eq(:lr_relative) + expect(inadequacy.state).to eq(mock_state) + expect(inadequacy.conflicting_states).to eq(mock_conflicting_states) + expect(inadequacy.details[:reason]).to eq("test") + end + + it "creates a PSLR-relative inadequacy" do + inadequacy = Lrama::State::PslrInadequacy.new( + type: Lrama::State::PslrInadequacy::PSLR_RELATIVE, + state: mock_state, + conflicting_states: mock_conflicting_states, + details: {} + ) + + expect(inadequacy.type).to eq(:pslr_relative) + end + end + + describe "#to_s" do + it "returns a human-readable description" do + inadequacy = 
Lrama::State::PslrInadequacy.new( + type: Lrama::State::PslrInadequacy::PSLR_RELATIVE, + state: mock_state, + conflicting_states: mock_conflicting_states, + details: {} + ) + + expect(inadequacy.to_s).to include("PSLR Inadequacy") + expect(inadequacy.to_s).to include("pslr_relative") + expect(inadequacy.to_s).to include("state 0") + expect(inadequacy.to_s).to include("1, 2") + end + end + + describe "constants" do + it "defines LR_RELATIVE constant" do + expect(Lrama::State::PslrInadequacy::LR_RELATIVE).to eq(:lr_relative) + end + + it "defines PSLR_RELATIVE constant" do + expect(Lrama::State::PslrInadequacy::PSLR_RELATIVE).to eq(:pslr_relative) + end + end +end + +RSpec.describe Lrama::State::PslrCompatibilityChecker do + let(:rangle) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:rshift) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 1 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + let(:accepting_state_ids) { scanner_fsa.states.select(&:accepting?).map(&:id) } + let(:short_state_id) { accepting_state_ids.min } + let(:long_state_id) { accepting_state_ids.max } + + describe "#initialize" do + it "creates a compatibility checker" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + expect(checker).to be_a(Lrama::State::PslrCompatibilityChecker) + end + end + + describe "#compatible?" 
do + context "when both states select same tokens" do + it "returns true" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + allow(scanner_accepts).to receive(:[]).and_return(rangle) + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state1 = instance_double(Lrama::State, id: 0) + state2 = instance_double(Lrama::State, id: 1) + + expect(checker.compatible?(state1, state2, scanner_fsa)).to be true + end + end + + context "when both states have no tokens (nil)" do + it "returns true" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + allow(scanner_accepts).to receive(:[]).and_return(nil) + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state1 = instance_double(Lrama::State, id: 0) + state2 = instance_double(Lrama::State, id: 1) + + expect(checker.compatible?(state1, state2, scanner_fsa)).to be true + end + end + + context "when states select different tokens" do + it "returns false" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + + # State 0 selects RANGLE, State 1 selects RSHIFT + allow(scanner_accepts).to receive(:[]) do |state_id, _fsa_state_id| + if state_id == 0 + rangle + else + rshift + end + end + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state1 = instance_double(Lrama::State, id: 0) + state2 = instance_double(Lrama::State, id: 1) + + expect(checker.compatible?(state1, state2, scanner_fsa)).to be false + end + end + end + + describe "#profile" do + it "returns a stable accepting-state profile" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + allow(scanner_accepts).to receive(:[]) do |state_id, fsa_state_id| + if state_id == 0 + fsa_state_id == short_state_id ? rangle : rshift + else + fsa_state_id == short_state_id ? 
rangle : nil + end + end + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state = instance_double(Lrama::State, id: 0) + + expect(checker.profile(state, scanner_fsa)).to eq([ + [short_state_id, "RANGLE"], + [long_state_id, "RSHIFT"], + ]) + end + end + + describe "#group_by_profile" do + it "partitions states by scanner behavior" do + scanner_accepts = instance_double(Lrama::State::ScannerAccepts) + allow(scanner_accepts).to receive(:[]) do |state_id, fsa_state_id| + case [state_id, fsa_state_id] + when [0, short_state_id], [1, short_state_id] + rangle + when [0, long_state_id] + rshift + when [1, long_state_id] + nil + when [2, short_state_id] + rshift + when [2, long_state_id] + rshift + end + end + + checker = Lrama::State::PslrCompatibilityChecker.new( + scanner_accepts, + length_prec + ) + + state1 = instance_double(Lrama::State, id: 0) + state2 = instance_double(Lrama::State, id: 1) + state3 = instance_double(Lrama::State, id: 2) + + grouped = checker.group_by_profile([state1, state2, state3], scanner_fsa) + + expect(grouped.values.map(&:size)).to contain_exactly(1, 1, 1) + expect(grouped.keys).to include( + [[short_state_id, "RANGLE"], [long_state_id, "RSHIFT"]], + [[short_state_id, "RANGLE"], [long_state_id, nil]], + [[short_state_id, "RSHIFT"], [long_state_id, "RSHIFT"]], + ) + end + end +end diff --git a/spec/lrama/state/scanner_accepts_spec.rb b/spec/lrama/state/scanner_accepts_spec.rb new file mode 100644 index 00000000..6469c9e0 --- /dev/null +++ b/spec/lrama/state/scanner_accepts_spec.rb @@ -0,0 +1,200 @@ +# frozen_string_literal: true + +RSpec.describe Lrama::State::ScannerAccepts do + describe "#build and #[]" do + let(:rangle) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:rshift) do + id = 
Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 1 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + + context "with mock parser states" do + let(:mock_symbol) do + instance_double( + Lrama::Grammar::Symbol, + term?: true, + id: instance_double(Lrama::Lexer::Token::Ident, s_value: "RANGLE") + ) + end + + let(:mock_shift) do + instance_double( + Lrama::State::Action::Shift, + next_sym: mock_symbol + ) + end + + let(:mock_state) do + instance_double( + Lrama::State, + id: 0, + term_transitions: [mock_shift], + reduces: [] + ) + end + + it "builds scanner_accepts table" do + scanner_accepts = Lrama::State::ScannerAccepts.new( + [mock_state], + scanner_fsa, + lex_prec, + length_prec + ) + scanner_accepts.build + + expect(scanner_accepts.table).to be_a(Hash) + end + end + end + + describe "token selection logic" do + let(:token_a) do + id = Lrama::Lexer::Token::Ident.new(s_value: "TOKEN_A") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/a/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:token_ab) do + id = Lrama::Lexer::Token::Ident.new(s_value: "TOKEN_AB") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/ab/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 1 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([token_a, token_ab]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + + it "creates FSA with accepting states" do + expect(scanner_fsa.states).not_to be_empty + end + end + + describe "priority selection with lex-prec rules" do + let(:if_token) do + id = 
Lrama::Lexer::Token::Ident.new(s_value: "IF") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/if/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:id_token) do + id = Lrama::Lexer::Token::Ident.new(s_value: "ID") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/[a-z]+/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 1 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([if_token, id_token]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + + before do + left = Lrama::Lexer::Token::Ident.new(s_value: "IF") + right = Lrama::Lexer::Token::Ident.new(s_value: "ID") + lex_prec.add_rule( + left_token: left, + operator: Lrama::Grammar::LexPrec::HIGHER, + right_token: right, + lineno: 1 + ) + end + + it "respects higher priority rules" do + expect(lex_prec.higher_priority?("IF", "ID")).to be true + end + + it "creates length precedences from lex_prec" do + length_prec = Lrama::LengthPrecedences.new(lex_prec) + expect(length_prec).to be_a(Lrama::LengthPrecedences) + end + end + + describe "pure reduce states" do + let(:rangle) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RANGLE") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 0 + ) + end + + let(:rshift) do + id = Lrama::Lexer::Token::Ident.new(s_value: "RSHIFT") + regex = Lrama::Lexer::Token::Regex.new(s_value: "/>>/") + Lrama::Grammar::TokenPattern.new( + id: id, + pattern: regex, + lineno: 1, + definition_order: 1 + ) + end + + let(:scanner_fsa) { Lrama::ScannerFSA.new([rangle, rshift]) } + let(:lex_prec) { Lrama::Grammar::LexPrec.new } + let(:length_prec) { Lrama::LengthPrecedences.new(lex_prec) } + let(:reduce) { instance_double(Lrama::State::Action::Reduce) } + let(:parser_state) do + instance_double( + Lrama::State, + term_transitions: [], + reduces: [reduce], + ) + end + + 
it "uses propagated item lookaheads when explicit reduce lookahead is absent" do + allow(parser_state).to receive(:acceptable_pslr_reduce_lookahead).with(reduce).and_return([ + instance_double(Lrama::Grammar::Symbol, id: instance_double(Lrama::Lexer::Token::Ident, s_value: "RANGLE")), + instance_double(Lrama::Grammar::Symbol, id: instance_double(Lrama::Lexer::Token::Ident, s_value: "RSHIFT")), + ]) + + scanner_accepts = Lrama::State::ScannerAccepts.new( + [parser_state], + scanner_fsa, + lex_prec, + length_prec + ) + + expect(scanner_accepts.send(:compute_acc_sp, parser_state).to_a).to contain_exactly("RANGLE", "RSHIFT") + end + end +end diff --git a/spec/lrama/states_spec.rb b/spec/lrama/states_spec.rb index 28c217e2..d42bceb5 100644 --- a/spec/lrama/states_spec.rb +++ b/spec/lrama/states_spec.rb @@ -3158,5 +3158,493 @@ class : keyword_class tSTRING keyword_end %prec tPLUS expect(logger).not_to have_received(:error) end end + + context "when unresolved PSLR inadequacies remain" do + let(:header) do + <<~STR + %define lr.type pslr + %token-pattern RSHIFT />>/ + %token-pattern RANGLE />/ + %lex-prec RANGLE -s RSHIFT + + %% + + program: RSHIFT | RANGLE + STR + end + + it "fails fast instead of silently generating a parser" do + grammar = Lrama::Parser.new(header, "states/pslr_inadequacy.y").parse + grammar.prepare + grammar.validate! 
+ states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + states.compute + states.instance_variable_set( + :@pslr_inadequacies, + [ + Lrama::State::PslrInadequacy.new( + type: Lrama::State::PslrInadequacy::PSLR_RELATIVE, + state: instance_double(Lrama::State, id: 3), + conflicting_states: [instance_double(Lrama::State, id: 3), instance_double(Lrama::State, id: 4)], + details: { reason: "Scanner behavior differs between isocore states" } + ) + ] + ) + logger = Lrama::Logger.new + allow(logger).to receive(:error) + + expect { states.validate!(logger) }.to raise_error(SystemExit) + expect(logger).to have_received(:error).with(include("PSLR Inadequacy")) + end + end + end + + describe "PSLR split helpers" do + let(:y) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern RSHIFT />>/ + %token-pattern RANGLE />/ + %lex-prec RANGLE -s RSHIFT + + %% + + program: RSHIFT | RANGLE + GRAMMAR + end + + let(:grammar) do + g = Lrama::Parser.new(y, "states/pslr_split.y").parse + g.prepare + g.validate! 
+ g + end + + let(:states) { Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) } + let(:kernel_item) { instance_double(Lrama::State::Item, end_of_rule?: true) } + let(:reduce) { instance_double(Lrama::State::Action::Reduce, item: kernel_item, look_ahead: [grammar.find_symbol_by_s_value!("RSHIFT")]) } + let(:mock_state) do + instance_double( + Lrama::State, + is_compatible?: true, + kernels: [kernel_item], + term_transitions: [], + reduces: [reduce], + acceptable_reduce_lookahead: [grammar.find_symbol_by_s_value!("RSHIFT")], + acceptable_pslr_reduce_lookahead: [grammar.find_symbol_by_s_value!("RSHIFT")], + ) + end + + before do + states.instance_variable_set(:@scanner_fsa, Lrama::ScannerFSA.new(grammar.token_patterns)) + states.instance_variable_set(:@pslr_split_enabled, true) + end + + it "derives different PSLR signatures from different propagated lookaheads" do + current = states.send(:pslr_state_signature, mock_state) + filtered = states.send( + :pslr_state_signature, + mock_state, + { kernel_item => [grammar.find_symbol_by_s_value!("RANGLE")] }, + ) + + expect(current.map(&:last)).to include("RSHIFT") + expect(current.map(&:last)).not_to include("RANGLE") + expect(filtered.map(&:last)).to include("RANGLE") + expect(filtered.map(&:last)).not_to include("RSHIFT") + end + + it "treats states with different PSLR signatures as incompatible during splitting" do + filtered_lookaheads = { kernel_item => [grammar.find_symbol_by_s_value!("RANGLE")] } + expect(states.send(:compatible_split_state?, mock_state, filtered_lookaheads)).to be false + end + + it "detects unresolved PSLR inadequacies per transition" do + propagated = { kernel_item => [grammar.find_symbol_by_s_value!("RANGLE")] } + matching_state = instance_double(Lrama::State, id: 8) + next_state = instance_double(Lrama::State, id: 4) + transition_symbol = instance_double( + Lrama::Grammar::Symbol, + id: instance_double(Lrama::Lexer::Token::Ident, s_value: "RSHIFT"), + ) + transition = 
instance_double(Lrama::State::Action::Shift, to_state: next_state, next_sym: transition_symbol) + from_state = instance_double( + Lrama::State, + id: 1, + transitions: [transition], + propagate_lookaheads_without_filter: propagated, + ) + + allow(next_state).to receive(:lalr_isocore).and_return(next_state) + allow(next_state).to receive(:ielr_isocores).and_return([next_state, matching_state]) + allow(states).to receive(:pslr_state_signature).with(next_state, propagated).and_return([[1, "RANGLE"]]) + allow(states).to receive(:pslr_state_signature).with(next_state).and_return([[1, "RSHIFT"]]) + allow(states).to receive(:pslr_state_signature).with(matching_state).and_return([[1, "RANGLE"]]) + states.instance_variable_set(:@states, [from_state]) + + inadequacies = states.send(:detect_pslr_inadequacies) + + expect(inadequacies.size).to eq(1) + expect(inadequacies.first.details[:matching_state_id]).to eq(8) + expect(inadequacies.first.details[:transition_symbol]).to eq("RSHIFT") + end + + it "merges propagated lookaheads into an existing split state" do + current_lookaheads = { kernel_item => [grammar.find_symbol_by_s_value!("RSHIFT")] } + incoming_lookaheads = { kernel_item => [grammar.find_symbol_by_s_value!("RANGLE")] } + target_state = instance_double(Lrama::State, lookaheads_recomputed: true) + transition = instance_double(Lrama::State::Action::Shift, to_state: target_state) + split_state = instance_double( + Lrama::State, + kernels: [kernel_item], + item_lookahead_set: current_lookaheads, + transitions: [transition], + ) + + allow(split_state).to receive(:item_lookahead_set=) + + states.send(:merge_lookaheads, split_state, incoming_lookaheads) + + expect(split_state).to have_received(:item_lookahead_set=).with( + kernel_item => [grammar.find_symbol_by_s_value!("RSHIFT"), grammar.find_symbol_by_s_value!("RANGLE")], + ) + end + end + + describe "PSLR pure-reduce profile regression" do + let(:y) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern RSHIFT />>/ + 
%token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-prec RANGLE -s RSHIFT + + %% + + program + : templ + | rshift_expr + ; + + templ + : a RANGLE + ; + + rshift_expr + : a RSHIFT ID + ; + + a + : ID + ; + GRAMMAR + end + + let(:grammar) do + g = Lrama::Parser.new(y, "states/pslr_pure_reduce.y").parse + g.prepare + g.validate! + g + end + + it "keeps pure reduce states scanner-compatible without forcing a split" do + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + reduce_state = pslr_states.states.find do |state| + state.reduces.any? { |reduce| reduce.rule.display_name == "a -> ID" } + end + + expect(pslr_states.states_count).to eq(ielr_states.states_count) + expect(pslr_states.pslr_inadequacies).to be_empty + expect(pslr_states.send(:acceptable_tokens_for_pslr, reduce_state).to_a).to contain_exactly("RANGLE", "RSHIFT") + end + end + + describe "PSLR chained keyword split regression" do + let(:y) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern P /p/ + %token-pattern Q /q/ + %token-pattern X /x/ + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec IF - ID + + %% + + program + : kw_context + | id_context + ; + + kw_context + : P shared IF + ; + + id_context + : Q shared ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : X + ; + GRAMMAR + end + + let(:grammar) do + g = Lrama::Parser.new(y, "states/pslr_keyword_context.y").parse + g.prepare + g.validate! 
+ g + end + + it "splits every chained reduce state by scanner profile" do + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + reduce_states = pslr_states.states + .select { |state| state.reduces.any? } + .group_by { |state| state.reduces.first.rule.display_name } + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + + ["shared -> n1", "n1 -> n2", "n2 -> X"].each do |rule_name| + states_for_rule = reduce_states.fetch(rule_name) + token_sets = states_for_rule.map { |state| pslr_states.send(:acceptable_tokens_for_pslr, state) } + + expect(states_for_rule.size).to eq(2) + expect(states_for_rule.count(&:split_state?)).to eq(1) + expect(token_sets.any? { |set| set.include?("IF") && !set.include?("ID") }).to be(true) + expect(token_sets.any? { |set| set.include?("ID") && !set.include?("IF") }).to be(true) + end + end + end + + describe "PSLR chained shift/angle split regression" do + let(:y) do + <<~GRAMMAR + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-prec RANGLE -s RSHIFT + + %% + + program + : template_expr + | shift_expr + ; + + template_expr + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : MARK + ; + GRAMMAR + end + + let(:grammar) do + g = Lrama::Parser.new(y, "states/pslr_shift_chain.y").parse + g.prepare + g.validate! 
+ g + end + + it "splits every chained reduce state by shift/angle scanner profile" do + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + reduce_states = pslr_states.states + .select { |state| state.reduces.any? } + .group_by { |state| state.reduces.first.rule.display_name } + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + + ["shared -> n1", "n1 -> n2", "n2 -> MARK"].each do |rule_name| + states_for_rule = reduce_states.fetch(rule_name) + token_sets = states_for_rule.map { |state| pslr_states.send(:acceptable_tokens_for_pslr, state) } + + expect(states_for_rule.size).to eq(2) + expect(states_for_rule.count(&:split_state?)).to eq(1) + expect(token_sets.any? { |set| set.include?("RANGLE") && !set.include?("RSHIFT") }).to be(true) + expect(token_sets.any? 
{ |set| set.include?("RSHIFT") && !set.include?("RANGLE") }).to be(true) + end + end + end + + describe "PSLR mixed family regressions" do + { + "empty shared wrapper" => { + path: "states/pslr_mixed_empty.y", + grammar: <<~GRAMMAR, + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec IF - ID + %lex-prec RANGLE -s RSHIFT + + %% + + program + : kw + | ident + | templ + | shift_expr + ; + + kw + : P shared IF + ; + + ident + : Q shared ID + ; + + templ + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : opt n1 + ; + + opt + : + ; + + n1 + : MARK + ; + GRAMMAR + }, + "chain2 shared wrapper" => { + path: "states/pslr_mixed_chain2.y", + grammar: <<~GRAMMAR, + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec IF - ID + %lex-prec RANGLE -s RSHIFT + + %% + + program + : kw + | ident + | templ + | shift_expr + ; + + kw + : P shared IF + ; + + ident + : Q shared ID + ; + + templ + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + shared + : n1 + ; + + n1 + : n2 + ; + + n2 + : MARK + ; + GRAMMAR + } + }.each do |label, attrs| + it "keeps #{label} scanner-compatible" do + grammar = Lrama::Parser.new(attrs[:grammar], attrs[:path]).parse + grammar.prepare + grammar.validate! 
+ + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + expect(pslr_states.states_count).to be > ielr_states.states_count + expect(pslr_states.pslr_inadequacies).to be_empty + end + end end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index c832a1ce..338404c1 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -26,6 +26,9 @@ end require "lrama" +Dir[File.expand_path("support/**/*.rb", __dir__)].sort.each do |file| + require file +end module RSpecHelper def fixture_path(file_name) diff --git a/spec/support/pslr_family_helper.rb b/spec/support/pslr_family_helper.rb new file mode 100644 index 00000000..93f54e6e --- /dev/null +++ b/spec/support/pslr_family_helper.rb @@ -0,0 +1,159 @@ +# frozen_string_literal: true + +module PslrFamilyHelper + def build_grammar(source, path) + grammar = Lrama::Parser.new(source, path).parse + grammar.prepare + grammar.validate! + grammar + end + + def compute_ielr_and_pslr(grammar) + ielr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + ielr_states.compute + ielr_states.compute_ielr + + pslr_states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new)) + pslr_states.compute + pslr_states.compute_pslr + + [ielr_states, pslr_states] + end + + def acceptable_tokens(states, state) + states.send(:acceptable_tokens_for_pslr, state).to_a + end + + def shared_chain_rules(name:, terminal:, depth:, prefix: "n") + return <<~RULES if depth.zero? 
+ #{name} + : #{terminal} + ; + RULES + + rules = [<<~RULE] + #{name} + : #{prefix}1 + ; + RULE + + 1.upto(depth - 1) do |index| + rules << <<~RULE + #{prefix}#{index} + : #{prefix}#{index + 1} + ; + RULE + end + + rules << <<~RULE + #{prefix}#{depth} + : #{terminal} + ; + RULE + + rules.join("\n") + end + + def keyword_context_source(depth:) + <<~GRAMMAR + %define lr.type pslr + %token-pattern P /p/ + %token-pattern Q /q/ + %token-pattern X /x/ + %token-pattern IF /if/ + %token-pattern ID /[a-z]+/ + %lex-prec IF - ID + + %% + + program + : kw_context + | id_context + ; + + kw_context + : P shared IF + ; + + id_context + : Q shared ID + ; + + #{shared_chain_rules(name: "shared", terminal: "X", depth: depth)} + GRAMMAR + end + + def shift_angle_source(depth:) + <<~GRAMMAR + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %token-pattern ID /[a-z]+/ + %lex-prec RANGLE -s RSHIFT + + %% + + program + : template_expr + | shift_expr + ; + + template_expr + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + #{shared_chain_rules(name: "shared", terminal: "MARK", depth: depth)} + GRAMMAR + end + + def mixed_context_source(depth:) + <<~GRAMMAR + %define lr.type pslr + %token-pattern LT />/ + %token-pattern RANGLE />/ + %lex-prec IF - ID + %lex-prec RANGLE -s RSHIFT + + %% + + program + : kw_context + | id_context + | template_expr + | shift_expr + ; + + kw_context + : P shared IF + ; + + id_context + : Q shared ID + ; + + template_expr + : LT shared RANGLE + ; + + shift_expr + : START shared RSHIFT ID + ; + + #{shared_chain_rules(name: "shared", terminal: "MARK", depth: depth)} + GRAMMAR + end +end diff --git a/template/bison/_yacc.h b/template/bison/_yacc.h index 3e270c91..7cf4b6b2 100644 --- a/template/bison/_yacc.h +++ b/template/bison/_yacc.h @@ -71,6 +71,10 @@ struct YYLTYPE <%-# b4_declare_yyparse -%> int yyparse (<%= output.parse_param %>); +<%- if output.pslr_enabled? 
-%>
+<%= output.pslr_function_declarations %>
+<%- end -%>
+
 <%= output.percent_code("provides") %>
 
 <%-# b4_cpp_guard_close([b4_spec_mapped_header_file]) -%>
diff --git a/template/bison/yacc.c b/template/bison/yacc.c
index 6edd59a0..e9c86485 100644
--- a/template/bison/yacc.c
+++ b/template/bison/yacc.c
@@ -582,6 +582,41 @@ static const <%= output.int_type_for(output.context.yyr2) %> yyr2[] =
 <%= output.int_array_to_string(output.context.yyr2) %>
 };
 
+<%- if output.pslr_enabled? -%>
+<%= output.pslr_function_declarations %>
+<%- end -%>
+
+<%- if output.pslr_enabled? -%>
+<%= output.pslr_tables_and_functions %>
+<%- end -%>
+
+<%- if output.pslr_enabled? -%>
+#ifndef YYSETSTATE_CONTEXT
+# define YYSETSTATE_CONTEXT(CurrentState) ((void) 0)
+#endif
+<%- end -%>
+
+<%- if output.pslr_enabled? -%>
+int
+yy_state_accepts_token (int yystate, int yychar)
+{
+  yysymbol_kind_t yytoken = YYTRANSLATE (yychar);
+  int yyn = yypact[yystate];
+
+  if (yypact_value_is_default (yyn))
+    return 0;
+
+  yyn += yytoken;
+  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+    return 0;
+
+  yyn = yytable[yyn];
+  if (yyn <= 0)
+    return !yytable_value_is_error (yyn);
+
+  return 1;
+}
+<%- end -%>
 
 enum { YYENOMEM = -2 };
 
@@ -1582,6 +1617,9 @@ YYLTYPE yylloc = yyloc_default;
   YY_IGNORE_USELESS_CAST_BEGIN
   *yyssp = YY_CAST (yy_state_t, yystate);
   YY_IGNORE_USELESS_CAST_END
+<%- if output.pslr_enabled? -%>
+  YYSETSTATE_CONTEXT (yystate);
+<%- end -%>
   YY_STACK_PRINT (yyss, yyssp<%= output.user_args %>);
 
   if (yyss + yystacksize - 1 <= yyssp)
@@ -2065,4 +2103,3 @@
 #line <%= output.aux.epilogue_first_lineno - 1 %> "<%= output.grammar_file_path %>"
 <%= output.aux.epilogue -%>
 <%- end -%>
-