From 8b4d94884322d2591fa608add631a280b85966f7 Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Thu, 29 Jan 2026 14:45:37 +0100 Subject: [PATCH] Lexer/IdentifierParser: align identifier rules and tighten whitespace+sigil parsing --- src/main/java/org/perlonjava/lexer/Lexer.java | 12 ++++++-- .../perlonjava/parser/IdentifierParser.java | 28 ++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/perlonjava/lexer/Lexer.java b/src/main/java/org/perlonjava/lexer/Lexer.java index eac727e8..96fd6b8f 100644 --- a/src/main/java/org/perlonjava/lexer/Lexer.java +++ b/src/main/java/org/perlonjava/lexer/Lexer.java @@ -80,6 +80,14 @@ private int getCurrentCodePoint() { return c1; } + private static boolean isPerlIdentifierStart(int codePoint) { + return codePoint == '_' || UCharacter.hasBinaryProperty(codePoint, UProperty.XID_START); + } + + private static boolean isPerlIdentifierPart(int codePoint) { + return codePoint == '_' || UCharacter.hasBinaryProperty(codePoint, UProperty.XID_CONTINUE); + } + private void advanceCodePoint(int codePoint) { position += Character.charCount(codePoint); } @@ -150,7 +158,7 @@ public LexerToken nextToken() { } } else if (current >= '0' && current <= '9') { return consumeNumber(); - } else if (currentCp == '_' || Character.isUnicodeIdentifierStart(currentCp)) { + } else if (isPerlIdentifierStart(currentCp)) { return consumeIdentifier(); } else if (current < 128 && isOperator[current]) { return consumeOperator(); @@ -187,7 +195,7 @@ public LexerToken consumeIdentifier() { while (position < length) { int curCp = getCurrentCodePoint(); - if (curCp == '_' || Character.isUnicodeIdentifierPart(curCp)) { + if (isPerlIdentifierPart(curCp)) { advanceCodePoint(curCp); } else { break; diff --git a/src/main/java/org/perlonjava/parser/IdentifierParser.java b/src/main/java/org/perlonjava/parser/IdentifierParser.java index a352f324..7c7b7baa 100644 --- a/src/main/java/org/perlonjava/parser/IdentifierParser.java +++ b/src/main/java/org/perlonjava/parser/IdentifierParser.java @@ -58,6 +58,14 @@ public static String parseComplexIdentifier(Parser parser, boolean isTypeglob) { if (tokenAfter.type == LexerTokenType.EOF || tokenAfter.type == LexerTokenType.NEWLINE) { parser.throwError("syntax error"); } + + // Perl does not allow whitespace to turn into a punctuation special variable. + // For example "$\t = 4" must be a syntax error, not "$= 4". + if (tokenAfter.type == LexerTokenType.OPERATOR + && tokenAfter.text.length() == 1 + && "!|/*+-<>&~.=%'".indexOf(tokenAfter.text.charAt(0)) >= 0) { + parser.throwError("syntax error"); + } } // Check if the identifier is enclosed in braces @@ -201,7 +209,8 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr // Under 'no utf8', Perl allows many non-ASCII bytes as length-1 variables. // Only enforce XID_START there for multi-character identifiers. - boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8); + boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) + && !parser.ctx.compilerOptions.isEvalbytes; boolean hasMoreIdentifierContent = insideBraces && (nextToken.type == LexerTokenType.IDENTIFIER || nextToken.type == LexerTokenType.NUMBER); boolean mustValidateStart = utf8Enabled || id.length() > 1 || hasMoreIdentifierContent; @@ -209,14 +218,18 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr // Always reject the Unicode replacement character: it usually indicates an invalid byte sequence. // Perl reports these as unrecognized bytes (e.g. \xB6 in comp/parser_run.t test 66). // Also reject control characters (0x00-0x1F, 0x7F) as identifier starts. - if (cp == 0xFFFD || cp < 32 || cp == 127 || (mustValidateStart && !valid)) { + // Reject control characters and other non-graphic bytes that Perl treats as invalid variable names. + // In particular, C1 controls (0x80-0x9F) must always be rejected. + if (cp == 0xFFFD + || cp < 32 + || cp == 127 + || (cp >= 0x80 && cp <= 0x9F) + || (mustValidateStart && !valid)) { String hex; // Special case: if we got the Unicode replacement character (0xFFFD), // it likely means the original was an invalid UTF-8 byte sequence. - // For Perl compatibility, we should report common invalid bytes like \xB6 - if (cp == 0xFFFD || cp == 0x00B6) { - // This is likely \xB6 (182) which gets converted to replacement char - // For now, assume it's \xB6 to match the test expectation + // For Perl compatibility, we should report a representative invalid byte. + if (cp == 0xFFFD) { hex = "\\xB6"; } else { if (cp <= 255) { @@ -240,7 +253,8 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr int cp = id.codePointAt(0); boolean valid = cp == '_' || UCharacter.hasBinaryProperty(cp, UProperty.XID_START); - boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8); + boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) + && !parser.ctx.compilerOptions.isEvalbytes; boolean mustValidateStart = utf8Enabled || id.length() > 1; if (mustValidateStart && !valid) {