79643948

Date: 2025-05-29 13:12:46
Score: 2
Natty:
Report link

Here's an attempt to percent-encode as little as possible, but as much as necessary.

It's very much work-in-progress, but may be of some use?

import java.net.URI;
import java.net.URISyntaxException;
import java.util.BitSet;
import java.util.HexFormat;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * Builds {@link URI}s, percent-encoding only where necessary, as defined in
 * <a href="https://www.rfc-editor.org/rfc/rfc3986">RFC 3986</a>.<br>
 * <br>
 * The focus of this prototype is the Query and Fragment components.<br>
 * Everything else is delegated to the
 * {@link URI#URI(String, String, String, int, String, String, String)}
 * constructor, passing {@code null} for both Query and Fragment.<br>
 * <br>
 * After percent-encoding Query and Fragment, they are appended to the
 * {@link URI#toString()} of that prefix and the result is parsed into the final {@link URI}.<br>
 * <br>
 * One deliberate deviation from RFC 3986:<br>
 * RFC 3986 exempts '&amp;' and '=' from percent-encoding in queries,
 * but both are used as delimiters when a query is made of key/value pairs.<br>
 * If a key or a value contained these characters, parsing the resulting query would be ambiguous.<br>
 * {@link QueryKeyValuePair} therefore percent-encodes them inside keys and values,
 * while {@link Query#of(QueryKeyValuePair...)} uses them unencoded as delimiters.
 */
public final class UriBuilder {

    /** Port value meaning "no port"; it is passed unchanged to the {@link URI} constructor. */
    public static final int NULL_PORT = -1;

    private static final HexFormat HEX_FORMAT_UPPER = HexFormat.of().withUpperCase();

    /** RFC 3986 only exempts ASCII characters, so 128 bits cover every possible exemption. */
    private static final int RFC_3986_BITSET_LENGTH = 128;

    /** Characters left unencoded inside a key or a value of a key/value query ('&amp;' and '=' are NOT exempt). */
    private static final BitSet ENCODING_EXEMPT_4_KEY_PAIR_QUERY;
    /** Characters left unencoded inside a plain-text query. */
    private static final BitSet ENCODING_EXEMPT_4_PLAINTEXT_QUERY;
    /** Characters left unencoded inside a fragment (same set as a plain-text query). */
    private static final BitSet ENCODING_EXEMPT_4_FRAGMENT;

    static {
        // ABNF building blocks as defined in RFC 3986 Appendix A.
        final var digit      = bitSetRangeInclusive('0', '9');
        final var alpha      = bitSetOr(bitSetRangeInclusive('A', 'Z'), bitSetRangeInclusive('a', 'z'));
        final var unreserved = bitSetOr(alpha, digit, bitSetOf('-', '.', '_', '~'));

        // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
        final var subDelimsExceptAmpersandAndEquals = bitSetOf('!', '$', '\'', '(', ')', '*', '+', ',', ';');
        final var subDelims = bitSetOr(subDelimsExceptAmpersandAndEquals, bitSetOf('&', '='));

        // pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
        final var pcharExtras = bitSetOf(':', '@');
        // query = fragment = *( pchar / "/" / "?" )
        final var queryExtras = bitSetOf('/', '?');

        ENCODING_EXEMPT_4_KEY_PAIR_QUERY  = bitSetOr(unreserved, subDelimsExceptAmpersandAndEquals, pcharExtras, queryExtras);
        ENCODING_EXEMPT_4_PLAINTEXT_QUERY = bitSetOr(unreserved, subDelims, pcharExtras, queryExtras);
        // Shares the instance with the plain-text query set; neither is modified after initialization.
        ENCODING_EXEMPT_4_FRAGMENT        = ENCODING_EXEMPT_4_PLAINTEXT_QUERY;
    }

    private String scheme   = null;
    private String userInfo = null;
    private String host     = null;
    private int    port     = NULL_PORT;
    private String path     = null;

    // NOTE(review): these two fields are public in the original API and are kept public for compatibility;
    // prefer the setters.
    public Query    query    = null;
    public Fragment fragment = null;

    /**
     * Demo: prints the URI without query/fragment ("Native") and the complete URI ("Generated")
     * for a handful of query and fragment variations.
     *
     * @param args ignored
     * @throws URISyntaxException if one of the demo URIs cannot be built
     */
    public static void main(final String[] args) throws URISyntaxException {

        // Every character that is exempt from percent-encoding in a plain-text query.
        final var qAll = toString(ENCODING_EXEMPT_4_PLAINTEXT_QUERY);

        final var host = "stackoverflow.com";
        final var path = "/questions/5330104/encoding-url-query-parameters-in-java";

        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(""));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(qAll));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(Query.of(QueryKeyValuePair.of(qAll, ""))));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(Query.of(QueryKeyValuePair.of("", ""))));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(Query.of()).setFragment(qAll));
    }

    /** Prints the prefix URI and the complete URI of the given builder, followed by a blank line. */
    private static void printDemo(final UriBuilder builder) throws URISyntaxException {
        System.out.println("Native.....: " + builder.buildPrefix());
        System.out.println("Generated..: " + builder.build());
        System.out.println();
    }

    /**
     * One key/value pair of a query, together with its percent-encoded {@code key=value} form.
     *
     * @param rawKey   the key as supplied by the caller
     * @param rawValue the value as supplied by the caller
     * @param encoded  the percent-encoded {@code key=value} text
     */
    public record QueryKeyValuePair(String rawKey, String rawValue, String encoded) {

        /**
         * Percent-encodes a key and a value (including any '&amp;' and '=' they contain)
         * and joins them with an unencoded '='.
         *
         * @param key      the raw key; {@code null} is treated as empty
         * @param rawValue the raw value; {@code null} is treated as empty
         * @return the pair, or {@code null} if both key and value are empty
         */
        public static QueryKeyValuePair of(final String key, final String rawValue) {

            final var keyText     = Objects.requireNonNullElse(key, "");
            final var valueText   = Objects.requireNonNullElse(rawValue, "");
            final var totalLength = keyText.length() + valueText.length();

            if (totalLength == 0) {
                return null;
            }
            // Each char takes at most 3 chars when encoded ("%XX" per UTF-8 byte); +1 for the '='.
            final var sb = new StringBuilder(totalLength * 3 + 1);

            percentEncode(sb, keyText, ENCODING_EXEMPT_4_KEY_PAIR_QUERY);
            sb.append('=');
            percentEncode(sb, valueText, ENCODING_EXEMPT_4_KEY_PAIR_QUERY);

            return new QueryKeyValuePair(key, rawValue, sb.toString());
        }
    }

    /**
     * A query, built either from key/value pairs or from plain text, together with its percent-encoded form.
     *
     * @param pairs             the key/value pairs, or {@code null} for a plain-text query
     * @param rawPlainTextQuery the plain text, or {@code null} for a key/value query
     * @param encoded           the percent-encoded query (without the leading '?')
     */
    public record Query(QueryKeyValuePair[] pairs, String rawPlainTextQuery, String encoded) {

        /**
         * Joins the encoded forms of the non-null pairs with an unencoded '&amp;'.
         *
         * @param pairs the pairs; {@code null} elements are skipped
         * @return the query, or {@code null} if there is nothing to encode
         */
        public static Query of(final QueryKeyValuePair... pairs) {

            if (pairs == null || pairs.length == 0) {
                return null;
            }
            final var percentEncoded = Stream.of(pairs)
                    .filter(Objects::nonNull)
                    .map(pair -> pair.encoded())
                    .collect(Collectors.joining("&"));

            if (percentEncoded.isEmpty()) {
                return null;
            }
            // Copy the array so later changes to the caller's array do not leak into the record.
            return new Query(pairs.clone(), null, percentEncoded);
        }

        /**
         * Percent-encodes a plain-text query, leaving '&amp;' and '=' untouched.
         *
         * @param rawPlainTextQuery the raw query text (without the leading '?')
         * @return the query, or {@code null} if the text is {@code null} or empty
         */
        public static Query of(final String rawPlainTextQuery) {

            if (rawPlainTextQuery == null || rawPlainTextQuery.isEmpty()) {
                return null;
            }
            final var sb = new StringBuilder(rawPlainTextQuery.length() * 3);

            percentEncode(sb, rawPlainTextQuery, ENCODING_EXEMPT_4_PLAINTEXT_QUERY);

            return new Query(null, rawPlainTextQuery, sb.toString());
        }
    }

    /**
     * A fragment together with its percent-encoded form.
     *
     * @param fragment the raw fragment text
     * @param encoded  the percent-encoded fragment (without the leading '#')
     */
    public record Fragment(String fragment, String encoded) {

        /**
         * Percent-encodes a fragment.
         *
         * @param fragment the raw fragment text (without the leading '#')
         * @return the fragment, or {@code null} if the text is {@code null} or empty
         */
        public static Fragment of(final String fragment) {

            if (fragment == null || fragment.isEmpty()) {
                return null;
            }
            final var sb = new StringBuilder(fragment.length() * 3);

            percentEncode(sb, fragment, ENCODING_EXEMPT_4_FRAGMENT);

            return new Fragment(fragment, sb.toString());
        }
    }

    /** @return a new, empty builder */
    public static UriBuilder newBuilder() {
        return new UriBuilder();
    }

    /** Sets the scheme; {@code null} means none. */
    public UriBuilder setScheme(final String scheme) {
        this.scheme = scheme;
        return this;
    }

    /** Sets the user-info; {@code null} means none. */
    public UriBuilder setUserInfo(final String userInfo) {
        this.userInfo = userInfo;
        return this;
    }

    /** Sets the host; {@code null} means none. */
    public UriBuilder setHost(final String host) {
        this.host = host;
        return this;
    }

    /** Sets the port; {@link #NULL_PORT} means none. */
    public UriBuilder setPort(final int port) {
        this.port = port;
        return this;
    }

    /** Sets the path; {@code null} means none. */
    public UriBuilder setPath(final String path) {
        this.path = path;
        return this;
    }

    /** Sets an already encoded query; {@code null} means none. */
    public UriBuilder setQuery(final Query query) {
        this.query = query;
        return this;
    }

    /** Sets a plain-text query, which is percent-encoded here; {@code null} or empty means none. */
    public UriBuilder setQuery(final String rawQuery) {
        this.query = Query.of(rawQuery);
        return this;
    }

    /** Sets the fragment, which is percent-encoded here; {@code null} or empty means none. */
    public UriBuilder setFragment(final String fragment) {
        this.fragment = Fragment.of(fragment);
        return this;
    }

    /**
     * Builds the URI: everything up to the path is delegated to {@link URI}, then the
     * percent-encoded query and fragment (if any) are appended and the whole text is parsed.
     *
     * @return the complete URI
     * @throws URISyntaxException if {@link URI} rejects the prefix or the assembled text
     */
    public URI build() throws URISyntaxException {

        final var sb = new StringBuilder(buildPrefix().toString());

        if (this.query != null) {
            sb.append('?').append(this.query.encoded());
        }
        if (this.fragment != null) {
            sb.append('#').append(this.fragment.encoded());
        }
        return new URI(sb.toString());
    }

    /** Builds the URI without query and fragment, letting {@link URI} quote the other components. */
    private URI buildPrefix() throws URISyntaxException {
        return new URI(this.scheme, this.userInfo, this.host, this.port, this.path, /* Query */ null, /* Fragment */ null);
    }

    /**
     * Appends {@code rawValue} to {@code sb}, percent-encoding (as upper-case hex of the UTF-8 bytes)
     * every code point that is not in {@code exemptFromPercentEncoding}.
     *
     * @param sb                        receives the encoded text
     * @param rawValue                  the text to encode; must not be {@code null}
     * @param exemptFromPercentEncoding the code points to append unencoded
     */
    private static void percentEncode(final StringBuilder sb, final String rawValue, final BitSet exemptFromPercentEncoding) {
        /*
         * A surrogate pair arrives as one (supplementary) code point.
         * An orphan surrogate arrives as itself (0xD800..0xDFFF) and has no valid UTF-8 encoding.
         *
         * java.net.URLEncoder percent-encodes orphan surrogates as "%3F", the hex of '?'.
         * Here the orphan is replaced by '?' first; whether that '?' is then percent-encoded
         * depends on the exemptions parameter.
         *
         * TODO You might like to consider using the standard Replacement Character U+FFFD instead.
         */
        rawValue.codePoints()
                .map(codePoint -> isSurrogate(codePoint) ? '?' : codePoint)
                .forEach(codePoint -> {
                    if (exemptFromPercentEncoding.get(codePoint)) {
                        sb.append((char) codePoint); // exempt code points are always ASCII
                        return;
                    }
                    for (final byte utf8Byte : encodeTo_UTF_8_bytes(codePoint)) {
                        sb.append('%').append(HEX_FORMAT_UPPER.toHexDigits(utf8Byte));
                    }
                });
    }

    /** @return whether the code point lies in the surrogate range 0xD800..0xDFFF */
    private static boolean isSurrogate(final int codePoint) {
        return codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE;
    }

    /**
     * Encodes one code point as UTF-8.
     *
     * Note:
     * Prior to November 2003, UTF-8 permitted code points requiring one to six bytes.
     * RFC 3629 now restricts that to one to four bytes.
     * The arithmetic below handles both.
     *
     * @param codePoint a non-negative code point
     * @return its UTF-8 bytes, first byte at index 0
     */
    private static byte[] encodeTo_UTF_8_bytes(int codePoint) {

        if (codePoint < 0x80) {
            // 1-byte code points are trivial and MUST be excluded from the arithmetic below.
            return new byte[] {(byte) codePoint};
        }
        final var bitCount            = Integer.SIZE - Integer.numberOfLeadingZeros(codePoint);
        // 8..11 bits -> 2 bytes, 12..16 -> 3, 17..21 -> 4, 22..26 -> 5, 27..31 -> 6.
        final var utf8byteCount       = (bitCount + 3) / 5;
        // Shifts 2 to 6 one-bits into the top of the low-order byte: 0xC0, 0xE0, 0xF0, 0xF8, 0xFC.
        final var utf8firstBytePrefix = 0x3F_00 >>> utf8byteCount;

        final var utf8bytes           = new byte[utf8byteCount];

        // Fill from the last byte to the first, consuming 6 bits of the code point per byte.
        for (int i = utf8byteCount - 1; i >= 0; i--) {
            final int prefix = (i == 0) ? utf8firstBytePrefix : 0x80;
            utf8bytes[i] = (byte) (prefix | (0x3F & codePoint)); // the cast keeps only the low-order byte
            codePoint >>>= 6;
        }
        return utf8bytes;
    }

    /** @return a bit set with exactly the given bits set */
    private static BitSet bitSetOf(final int... bitIndices) {
        final var bits = new BitSet(RFC_3986_BITSET_LENGTH);
        for (final int bitIndex : bitIndices) {
            bits.set(bitIndex);
        }
        return bits;
    }

    /** @return a new bit set that is the union of the given bit sets */
    private static BitSet bitSetOr(final BitSet... bitSets) {
        final var union = new BitSet(RFC_3986_BITSET_LENGTH);
        for (final BitSet bitSet : bitSets) {
            union.or(bitSet);
        }
        return union;
    }

    /** @return a bit set with the bits {@code fromIndex..toIndex} (both inclusive) set */
    private static BitSet bitSetRangeInclusive(final int fromIndex, final int toIndex) {
        final var range = new BitSet(RFC_3986_BITSET_LENGTH);
        range.set(fromIndex, toIndex + 1);
        return range;
    }

    /** @return the set bits, in ascending order, each rendered as the char with that code */
    private static String toString(final BitSet bitSet) {
        return bitSet.stream()
                .collect(StringBuilder::new, (sb, bitIndex) -> sb.append((char) bitIndex), StringBuilder::append)
                .toString();
    }
}

Reasons:
  • Blacklisted phrase (1): stackoverflow
  • Long answer (-1):
  • Has code block (-0.5):
  • Ends in question mark (2):
  • Low reputation (0.5):
Posted by: Dave The Dane