Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/main/java/org/perlonjava/lexer/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ private int getCurrentCodePoint() {
return c1;
}

/**
 * Reports whether a code point is accepted as the first character of an identifier.
 *
 * <p>An underscore is always accepted; any other code point is accepted only when
 * it carries the Unicode {@code XID_Start} binary property.
 *
 * @param codePoint the Unicode code point to classify
 * @return {@code true} if the code point is {@code '_'} or has {@code XID_Start}
 */
private static boolean isPerlIdentifierStart(int codePoint) {
    if (codePoint == '_') {
        // Underscore is accepted explicitly, independent of the property lookup.
        return true;
    }
    return UCharacter.hasBinaryProperty(codePoint, UProperty.XID_START);
}

/**
 * Reports whether a code point is accepted as a non-initial character of an identifier.
 *
 * <p>An underscore is always accepted; any other code point is accepted only when
 * it carries the Unicode {@code XID_Continue} binary property.
 *
 * @param codePoint the Unicode code point to classify
 * @return {@code true} if the code point is {@code '_'} or has {@code XID_Continue}
 */
private static boolean isPerlIdentifierPart(int codePoint) {
    if (codePoint == '_') {
        // Underscore is accepted explicitly, independent of the property lookup.
        return true;
    }
    return UCharacter.hasBinaryProperty(codePoint, UProperty.XID_CONTINUE);
}

/**
 * Moves {@code position} forward past the given code point.
 *
 * <p>The step is the number of {@code char} values needed to represent the code
 * point, as reported by {@link Character#charCount(int)}.
 *
 * @param codePoint the code point that was just consumed
 */
private void advanceCodePoint(int codePoint) {
    final int charWidth = Character.charCount(codePoint);
    position = position + charWidth;
}
Expand Down Expand Up @@ -150,7 +158,7 @@ public LexerToken nextToken() {
}
} else if (current >= '0' && current <= '9') {
return consumeNumber();
} else if (currentCp == '_' || Character.isUnicodeIdentifierStart(currentCp)) {
} else if (isPerlIdentifierStart(currentCp)) {
return consumeIdentifier();
} else if (current < 128 && isOperator[current]) {
return consumeOperator();
Expand Down Expand Up @@ -187,7 +195,7 @@ public LexerToken consumeIdentifier() {

while (position < length) {
int curCp = getCurrentCodePoint();
if (curCp == '_' || Character.isUnicodeIdentifierPart(curCp)) {
if (isPerlIdentifierPart(curCp)) {
advanceCodePoint(curCp);
} else {
break;
Expand Down
28 changes: 21 additions & 7 deletions src/main/java/org/perlonjava/parser/IdentifierParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ public static String parseComplexIdentifier(Parser parser, boolean isTypeglob) {
if (tokenAfter.type == LexerTokenType.EOF || tokenAfter.type == LexerTokenType.NEWLINE) {
parser.throwError("syntax error");
}

// Perl does not allow whitespace to turn into a punctuation special variable.
// For example "$\t = 4" must be a syntax error, not "$= 4".
if (tokenAfter.type == LexerTokenType.OPERATOR
&& tokenAfter.text.length() == 1
&& "!|/*+-<>&~.=%'".indexOf(tokenAfter.text.charAt(0)) >= 0) {
parser.throwError("syntax error");
}
}

// Check if the identifier is enclosed in braces
Expand Down Expand Up @@ -201,22 +209,27 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr

// Under 'no utf8', Perl allows many non-ASCII bytes as length-1 variables.
// Only enforce XID_START there for multi-character identifiers.
boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8);
boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
&& !parser.ctx.compilerOptions.isEvalbytes;
boolean hasMoreIdentifierContent = insideBraces
&& (nextToken.type == LexerTokenType.IDENTIFIER || nextToken.type == LexerTokenType.NUMBER);
boolean mustValidateStart = utf8Enabled || id.length() > 1 || hasMoreIdentifierContent;

// Always reject the Unicode replacement character: it usually indicates an invalid byte sequence.
// Perl reports these as unrecognized bytes (e.g. \xB6 in comp/parser_run.t test 66).
// Also reject control characters (0x00-0x1F, 0x7F) as identifier starts.
if (cp == 0xFFFD || cp < 32 || cp == 127 || (mustValidateStart && !valid)) {
// Reject control characters and other non-graphic bytes that Perl treats as invalid variable names.
// In particular, C1 controls (0x80-0x9F) must always be rejected.
if (cp == 0xFFFD
|| cp < 32
|| cp == 127
|| (cp >= 0x80 && cp <= 0x9F)
|| (mustValidateStart && !valid)) {
String hex;
// Special case: if we got the Unicode replacement character (0xFFFD),
// it likely means the original was an invalid UTF-8 byte sequence.
// For Perl compatibility, we should report common invalid bytes like \xB6
if (cp == 0xFFFD || cp == 0x00B6) {
// This is likely \xB6 (182) which gets converted to replacement char
// For now, assume it's \xB6 to match the test expectation
// For Perl compatibility, we should report a representative invalid byte.
if (cp == 0xFFFD) {
hex = "\\xB6";
} else {
if (cp <= 255) {
Expand All @@ -240,7 +253,8 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
int cp = id.codePointAt(0);
boolean valid = cp == '_' || UCharacter.hasBinaryProperty(cp, UProperty.XID_START);

boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8);
boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
&& !parser.ctx.compilerOptions.isEvalbytes;
boolean mustValidateStart = utf8Enabled || id.length() > 1;

if (mustValidateStart && !valid) {
Expand Down