Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 97 additions & 23 deletions ext/json/ext/parser/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -479,21 +479,16 @@ static const signed char digit_values[256] = {

static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p)
{
signed char b;
uint32_t result = 0;
b = digit_values[p[0]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b;
b = digit_values[p[1]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b;
b = digit_values[p[2]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b;
b = digit_values[p[3]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b;
return result;
signed char b0 = digit_values[p[0]];
signed char b1 = digit_values[p[1]];
signed char b2 = digit_values[p[2]];
signed char b3 = digit_values[p[3]];
Comment on lines +482 to +485
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there is a risk of reading past the end of the buffer here.

Previously, the digit_values[p[0]] < 0 check would bail out immediately if p[0] was a NUL byte.

I think this optimization does make sense, but we need to ensure we can actually read 4 bytes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I checked all the callsites, they all already ensure that.

So I think I'm good with that optimization, except that I'd like to refactor it so the bounds check lives inside the function.


if ((b0 | b1 | b2 | b3) < 0) {
raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
}

return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3;
}

#define GET_PARSER_CONFIG \
Expand Down Expand Up @@ -643,9 +638,58 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserCon
typedef struct _json_unescape_positions {
long size;
const char **positions;
bool has_more;
unsigned long additional_backslashes;
} JSON_UnescapePositions;

/*
 * Locate the first backslash within the `n` bytes starting at `src`.
 *
 * Returns a pointer to that byte, or NULL when the range contains no
 * backslash. On every platform other than the one selected below this is
 * exactly memchr(src, '\\', n).
 */
ALWAYS_INLINE(static) void *find_backslash(const void *src, size_t n) {
// __APPLE__ && __aarch64__ already imply HAVE_SIMD_NEON and
// JSON_CPU_LITTLE_ENDIAN_64BITS; they are spelled out anyway for clarity and
// for consistency with the other guards in this file.
#if defined(__APPLE__) && defined(__aarch64__) && HAVE_SIMD_NEON && JSON_CPU_LITTLE_ENDIAN_64BITS
    // Lane weights run from 16 down to 1 so that the earliest matching lane
    // carries the largest weight. A single horizontal max then identifies it,
    // which avoids the negation + OR + vminvq_u8 sequence that ascending
    // weights would require.
    static const uint8_t lane_weights[16] = { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 };

    const unsigned char *cursor = (const unsigned char *)src;
    size_t remaining = n;

    // Phase 1: scan 16 bytes per iteration with NEON.
    while (remaining >= sizeof(uint8x16_t)) {
        uint8x16_t block = vld1q_u8(cursor);
        uint8x16_t match_mask = vceqq_u8(block, vdupq_n_u8('\\'));
        uint8x16_t weighted = vandq_u8(match_mask, vld1q_u8(lane_weights));
        int top_weight = vmaxvq_u8(weighted);
        if (top_weight != 0) {
            // Weight w was assigned to lane (16 - w).
            return (void *)(cursor + (16 - top_weight));
        }
        cursor += sizeof(uint8x16_t);
        remaining -= sizeof(uint8x16_t);
    }

    // Phase 2: fewer than 16 bytes are left, so at most one 8-byte word fits.
    if (remaining >= sizeof(uint64_t)) {
        uint64_t word;
        memcpy(&word, cursor, sizeof(uint64_t));
        // XOR maps every backslash byte to 0x00; the subtract-and-mask
        // expression then flags zero bytes in their high bit.
        uint64_t diff = word ^ 0x5c5c5c5c5c5c5c5c;
        uint64_t zero_flags = (diff - 0x0101010101010101) & ((~diff) & 0x8080808080808080);
        if (zero_flags) {
            // NOTE(review): trailing_zeros64 is defined elsewhere; presumably
            // it counts trailing zero bits, making this the index of the
            // lowest flagged byte on a little-endian target.
            int first_flagged_byte = trailing_zeros64(zero_flags) / CHAR_BIT;
            return (void *)(cursor + first_flagged_byte);
        }
        cursor += sizeof(uint64_t);
        remaining -= sizeof(uint64_t);
    }

    // Phase 3: check the last few bytes one at a time.
    const unsigned char *tail_end = cursor + remaining;
    for (; cursor < tail_end; cursor++) {
        if (*cursor == '\\') {
            return (void *)cursor;
        }
    }

    return NULL;
#else
    return memchr(src, '\\', n);
#endif
}

static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
{
while (positions->size) {
Expand All @@ -657,13 +701,43 @@ static inline const char *json_next_backslash(const char *pe, const char *string
}
}

if (positions->has_more) {
return memchr(pe, '\\', stringEnd - pe);
if (positions->additional_backslashes) {
positions->additional_backslashes--;
return find_backslash(pe, stringEnd - pe);
}

return NULL;
}

/*
 * Copy `size` bytes from `src` to `dest`.
 *
 * On the Apple aarch64 + NEON configuration selected below the copy is done
 * by hand in 16-byte, then 8-byte, then single-byte steps; everywhere else
 * it simply forwards to memcpy().
 */
static inline void json_memcpy(char *dest, const char *src, size_t size) {
#if defined(__APPLE__) && defined(__aarch64__) && HAVE_SIMD_NEON && JSON_CPU_LITTLE_ENDIAN_64BITS
    // Bulk phase: move 16 bytes per iteration through a NEON register.
    for (; size >= sizeof(uint8x16_t); size -= sizeof(uint8x16_t)) {
        vst1q_u8((uint8_t *)dest, vld1q_u8((const uint8_t *)src));
        dest += sizeof(uint8x16_t);
        src += sizeof(uint8x16_t);
    }

    // Fewer than 16 bytes remain, so a single 8-byte step is enough.
    // memcpy through a local keeps the access alignment-agnostic.
    if (size >= sizeof(uint64_t)) {
        uint64_t word;
        memcpy(&word, src, sizeof(uint64_t));
        memcpy(dest, &word, sizeof(uint64_t));
        dest += sizeof(uint64_t);
        src += sizeof(uint64_t);
        size -= sizeof(uint64_t);
    }

    // Tail: copy whatever is left byte by byte.
    for (size_t i = 0; i < size; i++) {
        dest[i] = src[i];
    }
#else
    memcpy(dest, src, size);
#endif
}

NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
{
bool intern = is_name || config->freeze;
Expand All @@ -681,7 +755,7 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser

while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
if (pe > p) {
MEMCPY(buffer, p, char, pe - p);
json_memcpy(buffer, p, pe - p);
buffer += pe - p;
}
switch (*++pe) {
Expand Down Expand Up @@ -746,7 +820,7 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser

char buf[4];
int unescape_len = convert_UTF32_to_UTF8(buf, ch);
MEMCPY(buffer, buf, char, unescape_len);
json_memcpy(buffer, buf, unescape_len);
buffer += unescape_len;
p = ++pe;
}
Expand All @@ -768,7 +842,7 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser
#undef APPEND_CHAR

if (stringEnd > p) {
MEMCPY(buffer, p, char, stringEnd - p);
json_memcpy(buffer, p, stringEnd - p);
buffer += stringEnd - p;
}
rb_str_set_len(result, buffer - bufferStart);
Expand Down Expand Up @@ -992,7 +1066,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi
JSON_UnescapePositions positions = {
.size = 0,
.positions = backslashes,
.has_more = false,
.additional_backslashes = 0,
};

do {
Expand All @@ -1007,7 +1081,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi
backslashes[positions.size] = state->cursor;
positions.size++;
} else {
positions.has_more = true;
positions.additional_backslashes++;
}
state->cursor++;
break;
Expand Down
20 changes: 20 additions & 0 deletions test/json/json_parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,10 @@ def test_backslash
json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]'
assert_equal data, parse(json)

data = ['""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""']
json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]'
assert_equal data, parse(json)

data = '["This is a "test" of the emergency broadcast system."]'
json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\""
assert_equal data, parse(json)
Expand Down Expand Up @@ -611,6 +615,10 @@ def test_backslash
json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002c\""
assert_equal data, parse(json)

data = "ab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002c"
json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002c\""
assert_equal data, parse(json)

data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f"
json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\""
assert_equal data, parse(json)
Expand All @@ -619,9 +627,21 @@ def test_backslash
json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\""
assert_equal data, parse(json)

data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b"
json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\""
assert_equal data, parse(json)

data = "a\n\t\f\b\n\t\f\b\n\t\f\b\n\t"
json = "\"a\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\""
assert_equal data, parse(json)

data = "a\n\t\f\b\n\t\f\b\n\t\f\b\n\ta\n\t\f\b\n\t\f\b\n\t\f\b\n\ta\n\t\f\b\n\t\f\b\n\t\f\b\n\ta\n\t\f\b\n\t\f\b\n\t\f\b\n\ta\n\t\f\b\n\t\f\b\n\t\f\b\n\t"
json = "\"a\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\ta\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\ta\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\ta\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\ta\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\""
assert_equal data, parse(json)

data = "\n" * 63
json = "\""+("\\n" * 63)+"\""
assert_equal data, parse(json)
end

class SubArray < Array
Expand Down