From ad3a5bed4f134aa6c6f2143808af85c0fe081336 Mon Sep 17 00:00:00 2001 From: Michael Dowling Date: Tue, 19 May 2026 15:38:09 -0500 Subject: [PATCH] Optimize query and URI encoding/validation Switch to masks instead of arrays, and write directly to StringBuilder when building up a query string. --- .../java/io/uri/QueryStringBuilder.java | 111 ++++++------ .../amazon/smithy/java/io/uri/SmithyUri.java | 162 +++++++++++------- .../smithy/java/io/uri/URLEncoding.java | 68 ++++++-- 3 files changed, 212 insertions(+), 129 deletions(-) diff --git a/io/src/main/java/software/amazon/smithy/java/io/uri/QueryStringBuilder.java b/io/src/main/java/software/amazon/smithy/java/io/uri/QueryStringBuilder.java index 57cbcf29f..e3bc7f203 100644 --- a/io/src/main/java/software/amazon/smithy/java/io/uri/QueryStringBuilder.java +++ b/io/src/main/java/software/amazon/smithy/java/io/uri/QueryStringBuilder.java @@ -5,95 +5,101 @@ package software.amazon.smithy.java.io.uri; -import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; /** * Used to build a query string from key value pair parameters. */ public final class QueryStringBuilder { - private final List values = new ArrayList<>(); - private final Set keysFromHttpQuery = new HashSet<>(); + private final StringBuilder builder = new StringBuilder(); + private final HashSet httpQueryKeys = new HashSet<>(); + private boolean empty = true; /** - * Clears the contents of the query string builder. + * Clears the contents of the query string builder so it can be reused. */ public void clear() { - values.clear(); + builder.setLength(0); + empty = true; + httpQueryKeys.clear(); } /** * Add a query string parameter and value to the query string. - *

- * The given key and value will be percent-encoded. If the value is already percent-encoded, it will be - * double percent-encoded. + * + *

The given key and value are percent-encoded. If the value is already + * percent-encoded, it will be double percent-encoded. * * @param key Key of the parameter. - * @param value Value of the parameter (or null). + * @param value Value of the parameter (or null, which appends {@code key=}). */ public void add(String key, String value) { - values.add(key); - keysFromHttpQuery.add(key); - values.add(value); + append(key, value); + httpQueryKeys.add(key); } /** - * Add a query string parameter and value to the query string comes from httpQueryParams trait. - *

- * The given key and value will be percent-encoded. If the value is already percent-encoded, it will be - * double percent-encoded. Query string parameters from httpQuery should take precedence if there are - * duplicate keys from @httpQuery. + * Add multiple values for the same key. * - * @param key Key of the parameter. - * @param value Value of the parameter (or null). + * @param key Key of the parameter. + * @param values Values to add (each is added as a separate {@code key=value} pair). */ - public void addForQueryParams(String key, String value) { - if (!keysFromHttpQuery.contains(key)) { - values.add(key); - values.add(value); + public void add(String key, List values) { + for (String v : values) { + add(key, v); } } /** - * Add a query string parameter and values to the query string. - *

- * The given key and values will be percent-encoded. If the values are already percent-encoded, it will be - * double percent-encoded. + * Add all entries from a map of {@code key -> [value, ...]} to the query string. * - * @param key Key of the parameter. - * @param values List of values + * @param values Map of keys to lists of values. */ - public void add(String key, List values) { - for (String value : values) { - add(key, value); + public void add(Map> values) { + for (var entry : values.entrySet()) { + add(entry.getKey(), entry.getValue()); } } /** - * Add a query string parameter and values to the query string. - *

- * The given key and values will be percent-encoded. If the values are already percent-encoded, it will be - * double percent-encoded. + * Add a query string parameter and value from a {@code @httpQueryParams} member. + * + *

Skips the key if it was already added via {@link #add} so {@code @httpQuery} + * members take precedence over {@code @httpQueryParams} entries with the same key. * - * @param values List of values + * @param key Key of the parameter. + * @param value Value of the parameter (or null). */ - public void add(Map> values) { - for (var entry : values.entrySet()) { - add(entry.getKey(), entry.getValue()); + public void addForQueryParams(String key, String value) { + if (!httpQueryKeys.contains(key)) { + append(key, value); + } + } + + private void append(String key, String value) { + if (!empty) { + builder.append('&'); + } else { + empty = false; + } + + URLEncoding.encodeUnreserved(key, builder, false); + builder.append('='); + if (value != null) { + URLEncoding.encodeUnreserved(value, builder, false); } } /** * Check if the query string is empty. * - * @return Returns true if empty. + * @return Returns true if no parameters have been added. */ public boolean isEmpty() { - return values.isEmpty(); + return empty; } /** @@ -103,28 +109,15 @@ public boolean isEmpty() { */ @Override public String toString() { - StringBuilder result = new StringBuilder(); - write(result); - return result.toString(); + return builder.toString(); } /** - * Write the query string directly to a string builder. + * Append the query string directly to a string builder. * * @param sink Where to write. */ public void write(StringBuilder sink) { - for (int i = 0; i < values.size(); i += 2) { - if (i > 0) { - sink.append('&'); - } - encode(values.get(i), sink); - sink.append('='); - encode(values.get(i + 1), sink); - } - } - - private void encode(String raw, StringBuilder builder) { - URLEncoding.encodeUnreserved(raw, builder, false); + sink.append(builder); } } diff --git a/io/src/main/java/software/amazon/smithy/java/io/uri/SmithyUri.java b/io/src/main/java/software/amazon/smithy/java/io/uri/SmithyUri.java index d943eb680..4b80833da 100644 --- a/io/src/main/java/software/amazon/smithy/java/io/uri/SmithyUri.java +++ b/io/src/main/java/software/amazon/smithy/java/io/uri/SmithyUri.java @@ -28,84 +28,130 @@ */ public final class SmithyUri { - // Lookup tables for fast character validation (ASCII range only). - private static final boolean[] SCHEME_CONT = new boolean[128]; - private static final boolean[] INVALID_HOST = new boolean[128]; - private static final boolean[] VALID_PATH; - private static final boolean[] VALID_QUERY; - private static final boolean[] VALID_USERINFO; - private static final boolean[] IS_HEX = new boolean[128]; + // ASCII (0..127) character classes encoded as 128-bit tables split into two longs each. + // Bit `c` of *_LOW (for c in 0..63) or bit `c - 64` of *_HIGH (for c in 64..127) is set iff `c` is in the class. + private static final long SCHEME_CONT_LOW; + private static final long SCHEME_CONT_HIGH; + private static final long INVALID_HOST_LOW; + private static final long INVALID_HOST_HIGH; + private static final long VALID_USERINFO_LOW; + private static final long VALID_USERINFO_HIGH; + private static final long VALID_PATH_LOW; + private static final long VALID_PATH_HIGH; + private static final long VALID_QUERY_LOW; + private static final long VALID_QUERY_HIGH; + private static final long IS_HEX_LOW; + private static final long IS_HEX_HIGH; static { + // Working accumulators built up via setBit, then frozen into the static fields below. + long schLow = 0, schHigh = 0; + long badHostLow = 0, badHostHigh = 0; + long uiLow = 0, uiHigh = 0; + long pathLow = 0, pathHigh = 0; + long queryLow = 0, queryHigh = 0; + long hexLow = 0, hexHigh = 0; + // Scheme continuation: lowercase alpha, digits, +, -, . - SCHEME_CONT['+'] = true; - SCHEME_CONT['-'] = true; - SCHEME_CONT['.'] = true; - - INVALID_HOST['/'] = true; - INVALID_HOST['?'] = true; - INVALID_HOST['#'] = true; - INVALID_HOST['%'] = true; - INVALID_HOST['['] = true; - INVALID_HOST[']'] = true; - INVALID_HOST[' '] = true; - INVALID_HOST['\t'] = true; - INVALID_HOST['\n'] = true; - INVALID_HOST['\r'] = true; + schLow |= 1L << '+'; + schLow |= 1L << '-'; + schLow |= 1L << '.'; + + // Invalid host characters. + badHostLow |= 1L << '/'; + badHostLow |= 1L << '?'; + badHostLow |= 1L << '#'; + badHostLow |= 1L << '%'; + badHostHigh |= 1L << ('[' - 64); + badHostHigh |= 1L << (']' - 64); + badHostLow |= 1L << ' '; + badHostLow |= 1L << '\t'; + badHostLow |= 1L << '\n'; + badHostLow |= 1L << '\r'; // RFC 3986 shared base: unreserved / pct-encoded / sub-delims // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" - VALID_USERINFO = new boolean[128]; - VALID_PATH = new boolean[128]; - VALID_QUERY = new boolean[128]; for (char c = 'a'; c <= 'z'; c++) { - SCHEME_CONT[c] = true; - VALID_USERINFO[c] = true; - VALID_PATH[c] = true; - VALID_QUERY[c] = true; + schHigh |= 1L << (c - 64); + uiHigh |= 1L << (c - 64); + pathHigh |= 1L << (c - 64); + queryHigh |= 1L << (c - 64); } + for (char c = 'A'; c <= 'Z'; c++) { - VALID_USERINFO[c] = true; - VALID_PATH[c] = true; - VALID_QUERY[c] = true; + uiHigh |= 1L << (c - 64); + pathHigh |= 1L << (c - 64); + queryHigh |= 1L << (c - 64); } + for (char c = '0'; c <= '9'; c++) { - VALID_USERINFO[c] = true; - VALID_PATH[c] = true; - VALID_QUERY[c] = true; - SCHEME_CONT[c] = true; + uiLow |= 1L << c; + pathLow |= 1L << c; + queryLow |= 1L << c; + schLow |= 1L << c; } + for (char c : "-._~!$&'()*+,;=%".toCharArray()) { - VALID_USERINFO[c] = true; - VALID_PATH[c] = true; - VALID_QUERY[c] = true; + if (c < 64) { + uiLow |= 1L << c; + pathLow |= 1L << c; + queryLow |= 1L << c; + } else { + uiHigh |= 1L << (c - 64); + pathHigh |= 1L << (c - 64); + queryHigh |= 1L << (c - 64); + } } // userinfo adds ":" - VALID_USERINFO[':'] = true; + uiLow |= 1L << ':'; // path adds ":" / "@" / "/" - VALID_PATH[':'] = true; - VALID_PATH['@'] = true; - VALID_PATH['/'] = true; + pathLow |= 1L << ':'; + pathHigh |= 1L << ('@' - 64); + pathLow |= 1L << '/'; // query adds everything path has plus "?" - VALID_QUERY[':'] = true; - VALID_QUERY['@'] = true; - VALID_QUERY['/'] = true; - VALID_QUERY['?'] = true; + queryLow |= 1L << ':'; + queryHigh |= 1L << ('@' - 64); + queryLow |= 1L << '/'; + queryLow |= 1L << '?'; - // Hex setup for (char c = '0'; c <= '9'; c++) { - IS_HEX[c] = true; + hexLow |= 1L << c; } + for (char c = 'a'; c <= 'f'; c++) { - IS_HEX[c] = true; + hexHigh |= 1L << (c - 64); } + for (char c = 'A'; c <= 'F'; c++) { - IS_HEX[c] = true; + hexHigh |= 1L << (c - 64); } + + SCHEME_CONT_LOW = schLow; + SCHEME_CONT_HIGH = schHigh; + INVALID_HOST_LOW = badHostLow; + INVALID_HOST_HIGH = badHostHigh; + VALID_USERINFO_LOW = uiLow; + VALID_USERINFO_HIGH = uiHigh; + VALID_PATH_LOW = pathLow; + VALID_PATH_HIGH = pathHigh; + VALID_QUERY_LOW = queryLow; + VALID_QUERY_HIGH = queryHigh; + IS_HEX_LOW = hexLow; + IS_HEX_HIGH = hexHigh; + } + + /** + * Returns true iff {@code c} is a member of the class encoded by the (low, high) mask + * pair. Caller guarantees {@code 0 <= c < 128}. + */ + private static boolean maskContains(long low, long high, int c) { + return c < 64 + ? ((low >>> c) & 1L) != 0L + : ((high >>> (c - 64)) & 1L) != 0L; } private final String scheme; @@ -657,14 +703,14 @@ private static void validateScheme(String scheme) { for (int i = 1; i < scheme.length(); i++) { char c = scheme.charAt(i); - if (c >= 128 || !SCHEME_CONT[c]) { + if (c >= 128 || !maskContains(SCHEME_CONT_LOW, SCHEME_CONT_HIGH, c)) { throw new IllegalArgumentException("Invalid character in scheme: '" + c + "' in " + scheme); } } } private static void validateUserInfo(String userInfo) { - validateComponent(userInfo, VALID_USERINFO, "userInfo"); + validateComponent(userInfo, VALID_USERINFO_LOW, VALID_USERINFO_HIGH, "userInfo"); } private static void validateHost(String host) { @@ -677,7 +723,7 @@ private static void validateHost(String host) { if (c >= 'A' && c <= 'Z') { throw new IllegalArgumentException("Host must be lowercase: " + host); } - if (c < 128 && INVALID_HOST[c]) { + if (c < 128 && maskContains(INVALID_HOST_LOW, INVALID_HOST_HIGH, c)) { throw new IllegalArgumentException("Invalid character in host: '" + c + "' in " + host); } } @@ -690,17 +736,17 @@ private static void validatePort(int port) { } private static void validatePath(String path) { - validateComponent(path, VALID_PATH, "path"); + validateComponent(path, VALID_PATH_LOW, VALID_PATH_HIGH, "path"); } private static void validateQuery(String query) { - validateComponent(query, VALID_QUERY, "query"); + validateComponent(query, VALID_QUERY_LOW, VALID_QUERY_HIGH, "query"); } - private static void validateComponent(String value, boolean[] allowed, String component) { + private static void validateComponent(String value, long allowedLow, long allowedHigh, String component) { for (int i = 0; i < value.length(); i++) { char c = value.charAt(i); - if (c >= 128 || !allowed[c]) { + if (c >= 128 || !maskContains(allowedLow, allowedHigh, c)) { throw new IllegalArgumentException("Invalid character in " + component + ": '" + c + "'"); } if (c == '%') { @@ -719,6 +765,6 @@ private static void validatePercentEncoding(String value, int i, String componen } private static boolean isHex(char c) { - return c < 128 && IS_HEX[c]; + return c < 128 && maskContains(IS_HEX_LOW, IS_HEX_HIGH, c); } } diff --git a/io/src/main/java/software/amazon/smithy/java/io/uri/URLEncoding.java b/io/src/main/java/software/amazon/smithy/java/io/uri/URLEncoding.java index 456bd416d..a64523c08 100644 --- a/io/src/main/java/software/amazon/smithy/java/io/uri/URLEncoding.java +++ b/io/src/main/java/software/amazon/smithy/java/io/uri/URLEncoding.java @@ -14,23 +14,44 @@ public final class URLEncoding { private URLEncoding() {} - private static final boolean[] UNRESERVED = new boolean[128]; + // RFC 3986 unreserved character set encoded as a 128-bit table split into two longs. + // Bit `c` of UNRESERVED_LOW (for c in 0..63) or bit `c - 64` of UNRESERVED_HIGH + // (for c in 64..127) is set iff `c` is unreserved. + private static final long UNRESERVED_LOW; + private static final long UNRESERVED_HIGH; private static final char[] HEX = "0123456789ABCDEF".toCharArray(); static { + long low = 0L; + long high = 0L; for (char c = 'A'; c <= 'Z'; c++) { - UNRESERVED[c] = true; + high |= 1L << (c - 64); } for (char c = 'a'; c <= 'z'; c++) { - UNRESERVED[c] = true; + high |= 1L << (c - 64); } for (char c = '0'; c <= '9'; c++) { - UNRESERVED[c] = true; + low |= 1L << c; } - UNRESERVED['-'] = true; - UNRESERVED['.'] = true; - UNRESERVED['_'] = true; - UNRESERVED['~'] = true; + low |= 1L << '-'; + low |= 1L << '.'; + high |= 1L << ('_' - 64); + high |= 1L << ('~' - 64); + UNRESERVED_LOW = low; + UNRESERVED_HIGH = high; + } + + /** + * Returns true iff {@code c} is an RFC 3986 unreserved ASCII character. + */ + private static boolean isUnreserved(int c) { + if (c >= 128) { + return false; + } + + return c < 64 + ? ((UNRESERVED_LOW >>> c) & 1L) != 0L + : ((UNRESERVED_HIGH >>> (c - 64)) & 1L) != 0L; } /** @@ -42,10 +63,33 @@ private URLEncoding() {} * @param preserveSlashes true if '/' should be left unencoded. */ public static void encodeUnreserved(String source, StringBuilder sink, boolean preserveSlashes) { - sink.ensureCapacity(sink.length() + source.length()); - for (int i = 0; i < source.length(); i++) { + int len = source.length(); + sink.ensureCapacity(sink.length() + len); + + // Fast path: skip encoding if the input is already URL-safe. + int i = 0; + while (i < len) { + char c = source.charAt(i); + if (isUnreserved(c) || (preserveSlashes && c == '/')) { + i++; + } else { + break; + } + } + + if (i == len) { + sink.append(source); + return; + } + + if (i > 0) { + sink.append(source, 0, i); + } + + // Slow path: encode the remainder character-by-character. + for (; i < len; i++) { char c = source.charAt(i); - if (c < 128 && UNRESERVED[c]) { + if (isUnreserved(c)) { sink.append(c); } else if (preserveSlashes && c == '/') { sink.append('/'); @@ -53,7 +97,7 @@ public static void encodeUnreserved(String source, StringBuilder sink, boolean p percentEncode(sink, (byte) c); } else { int codePoint; - if (Character.isHighSurrogate(c) && i + 1 < source.length()) { + if (Character.isHighSurrogate(c) && i + 1 < len) { char d = source.charAt(i + 1); if (Character.isLowSurrogate(d)) { codePoint = Character.toCodePoint(c, d);