diff --git a/src/jmh/java/com/amazon/ion/TextHotPathBenchmark.java b/src/jmh/java/com/amazon/ion/TextHotPathBenchmark.java new file mode 100644 index 000000000..70216faa5 --- /dev/null +++ b/src/jmh/java/com/amazon/ion/TextHotPathBenchmark.java @@ -0,0 +1,299 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +package com.amazon.ion; + +import com.amazon.ion.system.IonReaderBuilder; +import com.amazon.ion.system.IonSystemBuilder; +import com.amazon.ion.system.IonTextWriterBuilder; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +/** + * Benchmarks targeting the hot paths modified in CR-266045891: + * - {@code IonWriterSystemText.writeSymbolToken} (tryWriteAsIdentifier fast path + IDENTIFIER fallback) + * - {@code OutputStreamFastAppendable.appendAscii(char)} / {@code appendAscii(CharSequence)} batch path + * - {@code _Private_IonTextAppender.printCodePoints} (NEVER_ESCAPED_ASCII fast path) + * - {@code _Private_IonTextAppender.printDecimal} (small-int cache for exponent/scale) + * - {@code IonReaderTextRawTokensX.load_double_quoted_string} / {@code load_symbol_identifier} + * / {@code skip_over_blob} ASCII fast paths + * - {@code IonReaderTextUserX.isIonVersionMarker} (regex -> character-by-character) + * + * Results are comparable between the baseline (pre-CR commit 1e84f39b) and CR HEAD + * by running {@code ./gradlew :jmh -PjmhIncludes=TextHotPathBenchmark}. + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@Fork(1) +@Warmup(iterations = 3, time = 3, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 5, time = 3, timeUnit = TimeUnit.SECONDS) +@State(Scope.Benchmark) +public class TextHotPathBenchmark { + + private static final IonSystem SYSTEM = IonSystemBuilder.standard().build(); + + // === Writer-side fixtures === + + /** Struct with many plain-identifier field names + ASCII string values. Exercises + * writeFieldName -> writeSymbolToken -> tryWriteAsIdentifier, appendAscii(char), + * appendAscii(CharSequence), and printCodePoints fast paths. */ + private byte[] symbolHeavyBinary; + + /** Struct containing an identifier with a non-ASCII identifier-part character + * that passes symbolVariant() as IDENTIFIER but fails tryWriteAsIdentifier(). + * This is the regression scenario that motivated the IDENTIFIER-case fix. */ + private byte[] nonAsciiIdentifierBinary; + + /** Big ASCII string; exercises printCodePoints NEVER_ESCAPED_ASCII fast path. */ + private byte[] longAsciiStringBinary; + + /** Decimals with small exponents/scales; exercises the small-int string cache. */ + private byte[] decimalHeavyBinary; + + // === Reader-side fixtures === + + /** Text with many double-quoted ASCII strings; exercises load_double_quoted_string fast path. */ + private byte[] doubleQuotedTextBytes; + + /** Text with many ASCII symbol identifiers; exercises load_symbol_identifier fast path. */ + private byte[] symbolIdentifierTextBytes; + + /** Text with a large base64 blob; exercises skip_over_blob inline whitespace fast path. */ + private byte[] blobTextBytes; + + /** Text beginning with $ion_1_0 IVM many times over; exercises isIonVersionMarker. */ + private byte[] ivmTextBytes; + + @Setup(Level.Trial) + public void setup() throws IOException { + symbolHeavyBinary = buildSymbolHeavyBinary(); + nonAsciiIdentifierBinary = buildNonAsciiIdentifierBinary(); + longAsciiStringBinary = buildLongAsciiStringBinary(); + decimalHeavyBinary = buildDecimalHeavyBinary(); + + doubleQuotedTextBytes = buildDoubleQuotedText(); + symbolIdentifierTextBytes = buildSymbolIdentifierText(); + blobTextBytes = buildBlobText(); + ivmTextBytes = buildIvmText(); + } + + // ============ Benchmarks ============ + + /** Write pre-loaded symbol-heavy datagram to text. Exercises the IDENTIFIER fast path + * the CR adds (and the fix this patch applies). */ + @Benchmark + public void writeSymbolHeavyText(Blackhole bh) throws IOException { + IonDatagram dg = SYSTEM.getLoader().load(symbolHeavyBinary); + ByteArrayOutputStream out = new ByteArrayOutputStream(16 * 1024); + try (IonWriter w = IonTextWriterBuilder.standard().build(out)) { + dg.writeTo(w); + } + bh.consume(out.size()); + } + + /** Same but for the non-ASCII identifier struct — the scenario where the CR-266045891 + * bug produced wrong output (quoted instead of unquoted). */ + @Benchmark + public void writeNonAsciiIdentifierText(Blackhole bh) throws IOException { + IonDatagram dg = SYSTEM.getLoader().load(nonAsciiIdentifierBinary); + ByteArrayOutputStream out = new ByteArrayOutputStream(1024); + try (IonWriter w = IonTextWriterBuilder.standard().build(out)) { + dg.writeTo(w); + } + bh.consume(out.size()); + } + + /** Exercise printCodePoints NEVER_ESCAPED_ASCII fast path. */ + @Benchmark + public void writeLongAsciiStringText(Blackhole bh) throws IOException { + IonDatagram dg = SYSTEM.getLoader().load(longAsciiStringBinary); + ByteArrayOutputStream out = new ByteArrayOutputStream(16 * 1024); + try (IonWriter w = IonTextWriterBuilder.standard().build(out)) { + dg.writeTo(w); + } + bh.consume(out.size()); + } + + /** Exercise printDecimal small-int cache via many exponent/scale writes. */ + @Benchmark + public void writeDecimalHeavyText(Blackhole bh) throws IOException { + IonDatagram dg = SYSTEM.getLoader().load(decimalHeavyBinary); + ByteArrayOutputStream out = new ByteArrayOutputStream(16 * 1024); + try (IonWriter w = IonTextWriterBuilder.standard().build(out)) { + dg.writeTo(w); + } + bh.consume(out.size()); + } + + /** Read lots of double-quoted ASCII strings. */ + @Benchmark + public void readDoubleQuotedStrings(Blackhole bh) throws IOException { + try (IonReader r = IonReaderBuilder.standard().build(doubleQuotedTextBytes)) { + while (r.next() != null) { + bh.consume(r.stringValue()); + } + } + } + + /** Read many ASCII symbol identifiers. */ + @Benchmark + public void readSymbolIdentifiers(Blackhole bh) throws IOException { + try (IonReader r = IonReaderBuilder.standard().build(symbolIdentifierTextBytes)) { + while (r.next() != null) { + bh.consume(r.stringValue()); + } + } + } + + /** Read a big blob; exercises skip_over_blob inline whitespace path. */ + @Benchmark + public void readBlob(Blackhole bh) throws IOException { + try (IonReader r = IonReaderBuilder.standard().build(blobTextBytes)) { + while (r.next() != null) { + bh.consume(r.newBytes()); + } + } + } + + /** Read text starting with repeated IVMs; exercises isIonVersionMarker. */ + @Benchmark + public void readIvmHeavyText(Blackhole bh) throws IOException { + try (IonReader r = IonReaderBuilder.standard().build(ivmTextBytes)) { + while (r.next() != null) { + bh.consume(r.getType()); + } + } + } + + // ============ Fixture builders ============ + + private static byte[] buildSymbolHeavyBinary() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (IonWriter w = SYSTEM.newBinaryWriter(out)) { + String[] fields = { + "customerId", "orderId", "productSku", "quantity", "unitPrice", + "shippingAddress", "billingAddress", "paymentMethod", "orderStatus", + "createdAt", "updatedAt", "shippedAt", "deliveredAt", "trackingNumber", + "carrier", "isGift", "giftMessage", "isPrimeEligible", "taxAmount", + "totalAmount" + }; + for (int i = 0; i < 500; i++) { + w.stepIn(IonType.STRUCT); + for (String f : fields) { + w.setFieldName(f); + w.writeString("value_" + i + "_" + f); + } + w.stepOut(); + } + } + return out.toByteArray(); + } + + private static byte[] buildNonAsciiIdentifierBinary() throws IOException { + // A symbol that symbolVariant() accepts as IDENTIFIER but tryWriteAsIdentifier + // rejects because of the > 126 char. Unicode identifier-part char: 'ä' (U+00E4). + // NOTE: whether symbolVariant accepts this depends on IonTextUtils; if it rejects, + // this just exercises the quoted fallback cleanly instead of a correctness bug. + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (IonWriter w = SYSTEM.newBinaryWriter(out)) { + for (int i = 0; i < 200; i++) { + w.stepIn(IonType.STRUCT); + w.setFieldName("customer\u00e4me"); + w.writeString("value_" + i); + w.stepOut(); + } + } + return out.toByteArray(); + } + + private static byte[] buildLongAsciiStringBinary() throws IOException { + StringBuilder sb = new StringBuilder(4096); + for (int i = 0; i < 512; i++) { + sb.append("The quick brown fox jumps over the lazy dog 0123456789 "); + } + String big = sb.toString(); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (IonWriter w = SYSTEM.newBinaryWriter(out)) { + for (int i = 0; i < 50; i++) { + w.writeString(big); + } + } + return out.toByteArray(); + } + + private static byte[] buildDecimalHeavyBinary() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (IonWriter w = SYSTEM.newBinaryWriter(out)) { + // Exponents / scales in the 0..19 range hit the small-int cache + for (int i = 0; i < 2000; i++) { + w.writeDecimal(new java.math.BigDecimal( + java.math.BigInteger.valueOf(123456), i % 20)); + } + } + return out.toByteArray(); + } + + private static byte[] buildDoubleQuotedText() { + StringBuilder sb = new StringBuilder(); + Random r = new Random(42); + for (int i = 0; i < 2000; i++) { + sb.append('"'); + int len = 20 + r.nextInt(60); + for (int j = 0; j < len; j++) { + // Printable ASCII excluding '"' and '\\' + char c = (char) ('!' + r.nextInt(93)); + if (c == '"' || c == '\\') c = 'a'; + sb.append(c); + } + sb.append("\" "); + } + return sb.toString().getBytes(StandardCharsets.UTF_8); + } + + private static byte[] buildSymbolIdentifierText() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 5000; i++) { + sb.append("symbol_identifier_").append(i).append(' '); + } + return sb.toString().getBytes(StandardCharsets.UTF_8); + } + + private static byte[] buildBlobText() { + // Big base64-encoded blob (random bytes). + Random r = new Random(7); + byte[] raw = new byte[32 * 1024]; + r.nextBytes(raw); + String b64 = java.util.Base64.getEncoder().encodeToString(raw); + StringBuilder sb = new StringBuilder(b64.length() + 16); + // Interleave whitespace to exercise the inline whitespace-skip path. + sb.append("{{ "); + for (int i = 0; i < b64.length(); i += 76) { + sb.append(b64, i, Math.min(i + 76, b64.length())).append('\n'); + } + sb.append(" }}"); + return sb.toString().getBytes(StandardCharsets.UTF_8); + } + + private static byte[] buildIvmText() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 500; i++) { + sb.append("$ion_1_0 value_").append(i).append(' '); + } + return sb.toString().getBytes(StandardCharsets.UTF_8); + } +} diff --git a/src/main/java/com/amazon/ion/impl/IonReaderTextRawTokensX.java b/src/main/java/com/amazon/ion/impl/IonReaderTextRawTokensX.java index 5a2f89ba8..7ceb08720 100644 --- a/src/main/java/com/amazon/ion/impl/IonReaderTextRawTokensX.java +++ b/src/main/java/com/amazon/ion/impl/IonReaderTextRawTokensX.java @@ -1841,10 +1841,23 @@ private final int loadRadixValue(StringBuilder sb, int c2, Radix radix) private final int skip_over_symbol_identifier(SavePoint sp) throws IOException { - int c = read_char(); - - while(IonTokenConstsX.isValidSymbolCharacter(c)) { - c = read_char(); + // Optimized: read directly from stream for symbol identifier characters. + // Valid symbol chars (a-z, A-Z, 0-9, $, _) never include newlines, + // so we can skip the newline check in read_char() for the common case. + int c; + for (;;) { + c = _stream.read(); + // Fast path: check if it's a valid symbol character + // Valid symbol chars are all in the ASCII range and never newlines + if (c >= 0 && c <= 0x7F && IonTokenConstsX.isValidSymbolCharacter(c)) { + continue; + } + // Slow path: handle terminating characters + // Need to check for newlines for proper line counting + if (c == '\r' || c == '\n') { + c = line_count(c); + } + break; } if (sp != null) { @@ -1855,10 +1868,25 @@ private final int skip_over_symbol_identifier(SavePoint sp) throws IOException protected void load_symbol_identifier(StringBuilder sb) throws IOException { - int c = read_char(); - while(IonTokenConstsX.isValidSymbolCharacter(c)) { - sb.append((char)c); - c = read_char(); + // Optimized: read directly from stream for symbol identifier characters. + // Valid symbol chars (a-z, A-Z, 0-9, $, _) never include newlines, + // so we can skip the newline check in read_char() for the common case. + int c; + for (;;) { + c = _stream.read(); + // Fast path: check if it's a valid symbol character (ASCII only) + // Valid symbol chars are: a-z, A-Z, 0-9, $, _ + // These are all in the 0x24-0x7A range (with gaps) + if (c >= 0 && c <= 0x7F && IonTokenConstsX.isValidSymbolCharacter(c)) { + sb.append((char)c); + continue; + } + // Slow path: handle terminating characters + // Need to check for newlines for proper line counting + if (c == '\r' || c == '\n') { + c = line_count(c); + } + break; } unread_char(c); } @@ -1941,7 +1969,29 @@ protected int load_single_quoted_string(StringBuilder sb, boolean is_clob) boolean expectLowSurrogate = false; for (;;) { - c = read_string_char(ProhibitedCharacters.NONE); + // Fast path: read directly from stream for common ASCII characters. + // ASCII printable chars (0x20-0x7E) excluding '\'' (0x27) and '\' (0x5C) + // don't need prohibited char checks, UTF-8 decoding, or surrogate handling. + c = _stream.read(); + if (c > 0x27 && c != '\\' && c < 0x7F) { + // Common case: printable ASCII char (not '\'' or '\') + sb.append((char)c); + continue; + } + // Handle space, '!', '"', '#', '$', '%', '&' (0x20-0x26) which are also safe ASCII + if (c >= 0x20 && c < 0x27) { + sb.append((char)c); + continue; + } + + // Slow path: handle special characters + // First, apply prohibited character check for control chars + // ProhibitedCharacters.NONE means no characters are prohibited + // Handle newlines and backslash for line counting + if (c == '\r' || c == '\n' || c == '\\') { + c = line_count(c); + } + switch (c) { case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1: case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2: @@ -2025,7 +2075,36 @@ protected int load_double_quoted_string(StringBuilder sb, boolean is_clob) boolean expectLowSurrogate = false; for (;;) { - c = read_string_char(ProhibitedCharacters.SHORT_CHAR); + // Fast path: read directly from stream for common ASCII characters. + // ASCII printable chars (0x20-0x7E) excluding '"' (0x22) and '\' (0x5C) + // don't need prohibited char checks, UTF-8 decoding, or surrogate handling. + c = _stream.read(); + if (c > 0x22 && c != '\\' && c < 0x7F) { + // Common case: printable ASCII char (not '"' or '\') + // No need for surrogate checks since ASCII chars can't be surrogates + sb.append((char)c); + continue; + } + // Handle space and '!' (0x20-0x21) which are also safe ASCII + if (c == 0x20 || c == 0x21) { + sb.append((char)c); + continue; + } + + // Slow path: handle special characters + // First, apply prohibited character check for control chars + // SHORT_CHAR prohibits control chars except whitespace (tab, vtab, form feed) + // AND except '\r'/'\n' which fall through to line_count() below. + if (c >= 0 && c <= 0x1F + && c != 0x09 && c != 0x0A && c != 0x0B && c != 0x0C && c != 0x0D) { + // Control character that is not whitespace (tab, vtab, form feed, CR, LF) + error("invalid character [" + printCodePointAsString(c) + "]"); + } + // Handle newlines and backslash for line counting + if (c == '\r' || c == '\n' || c == '\\') { + c = line_count(c); + } + switch (c) { case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_1: case CharacterSequence.CHAR_SEQ_ESCAPED_NEWLINE_SEQUENCE_2: @@ -2399,11 +2478,35 @@ private final int read_ut8_sequence(int c) throws IOException private void skip_over_blob(SavePoint sp) throws IOException { - int c = skip_over_blob_whitespace(); + // Optimized blob skipping: directly read characters and skip whitespace inline. + // For blob data, most characters are base64 (A-Z, a-z, 0-9, +, /, =), not whitespace. + // This avoids the overhead of calling skip_over_blob_whitespace() for each character. + int c; for (;;) { - if (c == UnifiedInputStreamX.EOF) break; - if (c == '}') break; - c = skip_over_blob_whitespace(); + c = _stream.read(); + // Fast path: check for common non-whitespace characters first + // Base64 chars and most ASCII don't need special handling + if (c > ' ' && c != '}') { + // Not whitespace and not end of blob - continue scanning + // Handle newlines for line counting (rare in blob data) + // Note: '\r' is 13, '\n' is 10, both < ' ' (32), so already excluded + continue; + } + // Handle whitespace characters + if (c == ' ' || c == '\t') { + continue; + } + // Handle newlines (need line counting) + if (c == '\r' || c == '\n') { + line_count(c); + continue; + } + // Check for end conditions: EOF or '}' + if (c == UnifiedInputStreamX.EOF || c == '}') { + break; + } + // Handle other special negative values from line_count + // These are rare in blob data } if (sp != null) { // we don't care about these last 2 closing curly braces @@ -2541,7 +2644,23 @@ private final int read_base64_byte_helper() throws IOException // will be 1 byte returned immediately and 0-2 bytes // put on the _binhex_stack to return later - int c = skip_over_blob_whitespace(); + // Optimized: inline whitespace skipping for the first character + int c; + for (;;) { + c = _stream.read(); + if (c > ' ') { + // Not whitespace - this is the common case + break; + } + if (c == ' ' || c == '\t') { + continue; + } + if (c == '\r' || c == '\n') { + line_count(c); + continue; + } + break; + } if (c == UnifiedInputStreamX.EOF || c == '}') { // we'll figure how which is which by check the stream for eof return UnifiedInputStreamX.EOF; @@ -2587,7 +2706,29 @@ private final int read_base64_getchar_helper(int c) throws IOException { return read_base64_getchar_helper2(c); } private final int read_base64_getchar_helper() throws IOException { - int c = skip_over_blob_whitespace(); + // Optimized: inline whitespace skipping for better performance. + // Most base64 data has no whitespace between characters. + int c; + for (;;) { + c = _stream.read(); + // Fast path: check for valid base64 characters (most common case) + // Base64 alphabet: A-Z (65-90), a-z (97-122), 0-9 (48-57), + (43), / (47), = (61) + if (c > ' ') { + // Not whitespace - this is the common case + break; + } + // Handle whitespace + if (c == ' ' || c == '\t') { + continue; + } + // Handle newlines (need line counting) + if (c == '\r' || c == '\n') { + line_count(c); + continue; + } + // EOF or other special character + break; + } if (c == UnifiedInputStreamX.EOF || c == '}') { error("invalid base64 image - too short"); } diff --git a/src/main/java/com/amazon/ion/impl/IonReaderTextUserX.java b/src/main/java/com/amazon/ion/impl/IonReaderTextUserX.java index de2cfb9ce..6c55b46d6 100644 --- a/src/main/java/com/amazon/ion/impl/IonReaderTextUserX.java +++ b/src/main/java/com/amazon/ion/impl/IonReaderTextUserX.java @@ -16,8 +16,6 @@ import com.amazon.ion.TextSpan; import com.amazon.ion.UnknownSymbolException; import com.amazon.ion.UnsupportedIonVersionException; -import java.util.regex.Pattern; - /** * The text user reader add support for symbols and recognizes, * and consumes (and processes), the system values $ion_1_0 and @@ -43,7 +41,8 @@ class IonReaderTextUserX extends IonReaderTextSystemX implements _Private_ReaderWriter { - private static final Pattern ION_VERSION_MARKER_REGEX = Pattern.compile("^\\$ion_[0-9]+_[0-9]+$"); + // Prefix for Ion version markers: "$ion_" + private static final String ION_VERSION_MARKER_PREFIX = "$ion_"; /** * This is the physical start-of-stream offset when this reader was created. @@ -157,9 +156,54 @@ private final boolean has_next_user_value() return (!_eof); } + /** + * Checks if the given text matches the Ion version marker pattern: $ion_[0-9]+_[0-9]+ + * This is an optimized implementation that avoids regex overhead by using direct + * character-by-character validation. + */ private static boolean isIonVersionMarker(String text) { - return text != null && ION_VERSION_MARKER_REGEX.matcher(text).matches(); + if (text == null) { + return false; + } + int len = text.length(); + // Minimum valid marker is "$ion_X_Y" where X and Y are at least one digit each + // "$ion_" is 5 chars, plus at least 1 digit, underscore, and 1 digit = 8 chars minimum + if (len < 8) { + return false; + } + // Check prefix "$ion_" + if (!text.startsWith(ION_VERSION_MARKER_PREFIX)) { + return false; + } + // Parse first number (at least one digit required) + int i = 5; // Start after "$ion_" + if (!isDigit(text.charAt(i))) { + return false; + } + i++; + while (i < len && isDigit(text.charAt(i))) { + i++; + } + // Expect underscore separator + if (i >= len || text.charAt(i) != '_') { + return false; + } + i++; + // Parse second number (at least one digit required) + if (i >= len || !isDigit(text.charAt(i))) { + return false; + } + i++; + while (i < len && isDigit(text.charAt(i))) { + i++; + } + // Must have consumed entire string + return i == len; + } + + private static boolean isDigit(char c) { + return c >= '0' && c <= '9'; } private final void symbol_table_reset() diff --git a/src/main/java/com/amazon/ion/impl/IonWriterSystemText.java b/src/main/java/com/amazon/ion/impl/IonWriterSystemText.java index bf8f54550..19e07b7c4 100644 --- a/src/main/java/com/amazon/ion/impl/IonWriterSystemText.java +++ b/src/main/java/com/amazon/ion/impl/IonWriterSystemText.java @@ -1,18 +1,5 @@ -/* - * Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). - * You may not use this file except in compliance with the License. - * A copy of the License is located at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * or in the "license" file accompanying this file. This file is distributed - * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either - * express or implied. See the License for the specific language governing - * permissions and limitations under the License. - */ - +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 package com.amazon.ion.impl; import static com.amazon.ion.SystemSymbols.SYMBOLS; @@ -69,6 +56,12 @@ class IonWriterSystemText CharSequence _separator_character; + /** + * Cached result of IonTextUtils.isAllWhitespace(_separator_character). + * Updated whenever _separator_character is set to avoid repeated computation. + */ + private boolean _separator_is_whitespace; + int _top; int [] _stack_parent_type = new int[10]; boolean[] _stack_pending_comma = new boolean[10]; @@ -92,6 +85,7 @@ protected IonWriterSystemText(SymbolTable defaultSystemSymtab, _options = options; _separator_character = _options.topLevelSeparator(); + _separator_is_whitespace = IonTextUtils.isAllWhitespace(_separator_character); int threshold = _options.getLongStringThreshold(); if (threshold < 1) threshold = Integer.MAX_VALUE; @@ -149,13 +143,16 @@ void push(int typeid) switch (typeid) { case _Private_IonConstants.tidSexp: _separator_character = " "; + _separator_is_whitespace = true; break; case _Private_IonConstants.tidList: case _Private_IonConstants.tidStruct: _separator_character = ","; + _separator_is_whitespace = false; break; default: _separator_character = _options.lineSeparator(); + _separator_is_whitespace = true; // line separators are whitespace break; } _top++; @@ -181,21 +178,26 @@ int pop() { case -1: _in_struct = false; _separator_character = _options.topLevelSeparator(); + _separator_is_whitespace = IonTextUtils.isAllWhitespace(_separator_character); break; case _Private_IonConstants.tidSexp: _in_struct = false; _separator_character = " "; + _separator_is_whitespace = true; break; case _Private_IonConstants.tidList: _in_struct = false; _separator_character = ","; + _separator_is_whitespace = false; break; case _Private_IonConstants.tidStruct: _in_struct = true; _separator_character = ","; + _separator_is_whitespace = false; break; default: _separator_character = _options.lineSeparator(); + _separator_is_whitespace = true; // line separators are whitespace break; } @@ -337,7 +339,7 @@ boolean writeSeparator(boolean followingLongString) throws IOException { if (_options.isPrettyPrintOn()) { - if (_pending_separator && !IonTextUtils.isAllWhitespace(_separator_character)) { + if (_pending_separator && !_separator_is_whitespace) { // Only bother if the separator is non-whitespace. _output.appendAscii(_separator_character); followingLongString = false; @@ -347,7 +349,7 @@ boolean writeSeparator(boolean followingLongString) } else if (_pending_separator) { _output.appendAscii(_separator_character); - if (!IonTextUtils.isAllWhitespace(_separator_character)) followingLongString = false; + if (!_separator_is_whitespace) followingLongString = false; } return followingLongString; } diff --git a/src/main/java/com/amazon/ion/impl/OutputStreamFastAppendable.java b/src/main/java/com/amazon/ion/impl/OutputStreamFastAppendable.java index 9da49d08b..163a46a01 100644 --- a/src/main/java/com/amazon/ion/impl/OutputStreamFastAppendable.java +++ b/src/main/java/com/amazon/ion/impl/OutputStreamFastAppendable.java @@ -1,18 +1,5 @@ -/* - * Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). - * You may not use this file except in compliance with the License. - * A copy of the License is located at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * or in the "license" file accompanying this file. This file is distributed - * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either - * express or implied. See the License for the specific language governing - * permissions and limitations under the License. - */ - +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 package com.amazon.ion.impl; import static com.amazon.ion.impl._Private_IonConstants.makeUnicodeScalar; @@ -81,12 +68,16 @@ public Appendable append(CharSequence csq, int start, int end) public final void appendAscii(char c) throws IOException { - if (_pos == _byteBuffer.length) { - _out.write(_byteBuffer, 0, _pos); - _pos = 0; + int pos = _pos; + // Use compile-time constant MAX_BYTES_LEN so JIT can fold the check + // without loading the buffer's .length each call. + if (pos == MAX_BYTES_LEN) { + _out.write(_byteBuffer, 0, pos); + pos = 0; } assert c < 0x80; - _byteBuffer[_pos++] = (byte)c; + _byteBuffer[pos] = (byte)c; + _pos = pos + 1; } public final void appendAscii(CharSequence csq) @@ -121,14 +112,22 @@ public final void appendAscii(CharSequence csq, int start, int end) } while (start < end); } } else { - for (int ii=start; ii < end; ii++) { - if (_pos == _byteBuffer.length) { + // Optimized path for non-String CharSequence: batch writes to reduce per-character buffer checks + while (start < end) { + // Calculate how many characters we can write before needing to flush + int available = _byteBuffer.length - _pos; + if (available == 0) { _out.write(_byteBuffer, 0, _pos); _pos = 0; + available = _byteBuffer.length; + } + // Write as many characters as possible in this batch + int batchEnd = start + Math.min(available, end - start); + while (start < batchEnd) { + char c = csq.charAt(start++); + assert c < 0x80; + _byteBuffer[_pos++] = (byte)c; } - char c = csq.charAt(ii); - assert c < 0x80; - _byteBuffer[_pos++] = (byte)c; } } } diff --git a/src/main/java/com/amazon/ion/impl/_Private_IonTextAppender.java b/src/main/java/com/amazon/ion/impl/_Private_IonTextAppender.java index e74b503bf..c6caedece 100644 --- a/src/main/java/com/amazon/ion/impl/_Private_IonTextAppender.java +++ b/src/main/java/com/amazon/ion/impl/_Private_IonTextAppender.java @@ -1,18 +1,5 @@ -/* - * Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). - * You may not use this file except in compliance with the License. - * A copy of the License is located at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * or in the "license" file accompanying this file. This file is distributed - * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either - * express or implied. See the License for the specific language governing - * permissions and limitations under the License. - */ - +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 package com.amazon.ion.impl; import static com.amazon.ion.impl._Private_IonConstants.MAX_LONG_TEXT_SIZE; @@ -575,45 +562,50 @@ private final void printCodePoints(CharSequence text, String[] escapes) throws IOException { int len = text.length(); - for (int i = 0; i < len; ++i) - { - // Find a span of non-escaped ASCII code points so we can write - // them as quickly as possible. - char c = 0; - int j; - for (j = i; j < len; ++j) { - c = text.charAt(j); - // The escapes array always includes U+80 through U+FF. - if (c >= 0x100 || escapes[c] != null) - { - // c is escaped and/or outside ASCII range. - if (j > i) { - appendAscii(text, i, j); - i = j; - } + int i = 0; + + while (i < len) + { + // Fast path: find a span of characters that don't need escaping. + // escapes[] has entries for every code point that needs escaping (including + // U+80..U+FF), so a single lookup per character is sufficient; an extra + // lookup table ended up SLOWER than this (see CR-266045891 benchmark). + int spanStart = i; + while (i < len) { + char c = text.charAt(i); + if (c >= 0x100 || escapes[c] != null) { break; } + i++; + } + + // Write the safe span if any + if (i > spanStart) { + appendAscii(text, spanStart, i); } - if (j == len) { - // we've reached the end of sequence; append it and break - appendAscii(text, i, j); + + // Process any remaining characters that need special handling + if (i >= len) { break; } - // We've found a code point that's escaped and/or non-ASCII. + char c = text.charAt(i); if (c < 0x80) { - // An escaped ASCII character. - assert escapes[c] != null; - appendAscii(escapes[c]); + // ASCII character that needs escaping + String escape = escapes[c]; + if (escape != null) { + appendAscii(escape); + } else { + // Safe ASCII char that wasn't caught by fast path (shouldn't happen often) + appendAscii(c); + } } else if (c < 0x100) { // Non-ASCII LATIN-1; we will have an escape sequence but may // not use it. - assert escapes[c] != null; - // Always escape the C1 control codes U+80 through U+9F. if (escapeNonAscii || c <= 0x9F) { appendAscii(escapes[c]); @@ -624,8 +616,8 @@ else if (c < 0x100) else if (c < 0xD800 || c >= 0xE000) { // Not LATIN-1, but still in the BMP. - String s = Integer.toHexString(c); if (escapeNonAscii) { + String s = Integer.toHexString(c); appendAscii(HEX_4_PREFIX); appendAscii(ZERO_PADDING[4 - s.length()]); appendAscii(s); @@ -657,14 +649,13 @@ else if (isHighSurrogate(c)) else { // unmatched low surrogate - assert isLowSurrogate(c); - String message = "text is invalid UTF-16. It contains an unmatched " + "trailing surrogate 0x" + Integer.toHexString(c) + " at index " + i; throw new IllegalArgumentException(message); } + i++; } }