Here's an attempt to percent-encode as little as possible, but as much as necessary.
It's very much a work in progress, but it may be of some use.
import java.net.URI;
import java.net.URISyntaxException;
import java.util.BitSet;
import java.util.HexFormat;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* A routine to build URIs, percent-encoding only as necessary, as defined in
* <a href="https://www.rfc-editor.org/rfc/rfc3986">RFC 3986</a>.<br>
* <br>
* The focus of this prototype is the entities Query & Fragment.<br>
* Everything else is delegated to the...<br>
* {@link URI#URI(String, String, String, int, String, String, String)}<br>
* ...constructor, passing null for both Query & Fragment.<br>
* <br>
* After correctly percent-encoding Query & Fragment,
* they are appended to
* {@link URI#toString()}.<br>
* <br>
* One exception was made to the RFC 3986 encoding:<br>
* RFC 3986 specifies '&' and '=' are exempt from percent-encoding in Queries.<br>
* But they are both used as delimiters when providing key/value pairs.<br>
* If either key or value should contain these characters,
* parsing the resultant Query could be tricky.<br>
* This class provides a mechanism to percent-encode them in the keys & values,
* whilst leaving them untouched when assembling the key/value pairs.
*/
public final class UriBuilder {

    /** Port value meaning "no port was set"; passed through unchanged to the {@link URI} constructor. */
    public static final int NULL_PORT = -1;

    private static final HexFormat HEX_FORMAT_UPPER = HexFormat.of().withUpperCase();
    private static final int RFC_3986_BITSET_LENGTH = 128;

    /** Characters left as-is inside a key or a value of a key/value query ('&' and '=' are NOT exempt). */
    private static final BitSet ENCODING_EXEMPT_4_KEY_PAIR_QUERY;
    /** Characters left as-is inside a plain-text query. */
    private static final BitSet ENCODING_EXEMPT_4_PLAINTEXT_QUERY;
    /** Characters left as-is inside a fragment (same set as for a plain-text query). */
    private static final BitSet ENCODING_EXEMPT_4_FRAGMENT;

    static {
        /*
         * ABNF building blocks as defined in RFC 3986 Appendix A.
         * "sub-delims" is split in two so that '&' and '=' can be withheld from key/value pairs.
         * ':' and '@' are not sub-delims; they are the extra characters that "pchar" permits.
         */
        final var subDelimsExceptAndEquals = bitSetOf('!', '$', '\'', '(', ')', '*', '+', ',', ';');
        final var subDelims = bitSetOr(subDelimsExceptAndEquals, bitSetOf('&', '='));
        final var pcharExtras = bitSetOf(':', '@');
        final var digit = bitSetRangeInclusive('0', '9');
        final var alpha = bitSetOr(
                bitSetRangeInclusive('A', 'Z'),
                bitSetRangeInclusive('a', 'z'));
        final var unreserved = bitSetOr(alpha, digit, bitSetOf('-', '.', '_', '~'));
        final var slashAndQuestionMark = bitSetOf('/', '?');

        /*
         * Combine the building blocks into the percent-encoding exemptions for the various entities:
         * query = fragment = *( pchar / "/" / "?" )
         */
        ENCODING_EXEMPT_4_KEY_PAIR_QUERY =
                bitSetOr(unreserved, subDelimsExceptAndEquals, pcharExtras, slashAndQuestionMark);
        ENCODING_EXEMPT_4_PLAINTEXT_QUERY =
                bitSetOr(unreserved, subDelims, pcharExtras, slashAndQuestionMark);
        ENCODING_EXEMPT_4_FRAGMENT = ENCODING_EXEMPT_4_PLAINTEXT_QUERY;
    }

    private String scheme = null;
    private String userInfo = null;
    private String host = null;
    private int port = NULL_PORT;
    private String path = null;
    // NOTE(review): these two fields are public in the original API; kept public for compatibility.
    public Query query = null;
    public Fragment fragment = null;

    /**
     * Demonstration: builds a handful of URIs and prints, for each one, the URI without
     * Query & Fragment ("Native") followed by the complete URI ("Generated").
     *
     * @param args ignored
     * @throws URISyntaxException if one of the demo URIs cannot be constructed
     */
    public static void main(final String[] args) throws URISyntaxException {
        final var qALL = toString(ENCODING_EXEMPT_4_PLAINTEXT_QUERY);
        final var host = "stackoverflow.com";
        final var path = "/questions/5330104/encoding-url-query-parameters-in-java";
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(""));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(qALL));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(Query.of(QueryKeyValuePair.of(qALL, ""))));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(Query.of(QueryKeyValuePair.of("", ""))));
        printDemo(newBuilder().setScheme("https").setHost(host).setPath(path).setQuery(Query.of()).setFragment(qALL));
    }

    /** Prints the prefix URI and the complete URI of the given builder, followed by a blank line. */
    private static void printDemo(final UriBuilder builder) throws URISyntaxException {
        final var prefixUri = builder.buildPrefix();
        final var uri = builder.build();
        System.out.println("Native.....: " + prefixUri);
        System.out.println("Generated..: " + uri);
        System.out.println();
    }

    /**
     * One key/value pair of a Query, together with its percent-encoded {@code key=value} form.
     * Unlike a plain-text Query, '&' and '=' inside the key or the value ARE percent-encoded.
     */
    public record QueryKeyValuePair(String rawKey, String rawValue, String encoded) {

        /**
         * Percent-encodes the key and the value and joins them with '='.
         *
         * @param key      the raw (unencoded) key, not null
         * @param rawValue the raw (unencoded) value, not null
         * @return the pair, or {@code null} if both key and value are empty
         * @throws NullPointerException if either argument is null
         */
        public static QueryKeyValuePair of(final String key, final String rawValue) {
            Objects.requireNonNull(key, "key");
            Objects.requireNonNull(rawValue, "rawValue");
            if (key.isEmpty() && rawValue.isEmpty()) {
                return null;
            }
            // Worst case for ASCII input: every character becomes "%XX", plus the '=' separator.
            final var sb = new StringBuilder((key.length() + rawValue.length()) * 3 + 1);
            percentEncode(sb, key, ENCODING_EXEMPT_4_KEY_PAIR_QUERY);
            sb.append('=');
            percentEncode(sb, rawValue, ENCODING_EXEMPT_4_KEY_PAIR_QUERY);
            return new QueryKeyValuePair(key, rawValue, sb.toString());
        }
    }

    /**
     * A Query, created either from key/value pairs or from plain text, together with its
     * percent-encoded form. Exactly one of {@code pairs} / {@code rawPlainTextQuery} is set by
     * the {@code of} factories; the other one is {@code null}.
     */
    public record Query(QueryKeyValuePair[] pairs, String rawPlainTextQuery, String encoded) {

        /** Copies the array so that later changes by the caller cannot contradict {@code encoded}. */
        public Query {
            pairs = (pairs == null) ? null : pairs.clone();
        }

        /** @return a copy of the key/value pairs, or {@code null} for a plain-text Query */
        @Override
        public QueryKeyValuePair[] pairs() {
            return (pairs == null) ? null : pairs.clone();
        }

        /**
         * Joins the encoded forms of the non-null pairs with '&'.
         *
         * @param pairs the pairs; null elements are skipped
         * @return the Query, or {@code null} if there is nothing to encode
         */
        public static Query of(final QueryKeyValuePair... pairs) {
            if (pairs == null || pairs.length == 0) {
                return null;
            }
            final var percentEncoded = Stream.of(pairs)
                    .filter(Objects::nonNull)
                    .map(QueryKeyValuePair::encoded)
                    .collect(Collectors.joining("&"));
            return percentEncoded.isEmpty() ? null : new Query(pairs, null, percentEncoded);
        }

        /**
         * Percent-encodes a plain-text query; '&' and '=' are left untouched.
         *
         * @param rawPlainTextQuery the raw (unencoded) query
         * @return the Query, or {@code null} if the text is null or empty
         */
        public static Query of(final String rawPlainTextQuery) {
            if (rawPlainTextQuery == null || rawPlainTextQuery.isEmpty()) {
                return null;
            }
            final var sb = new StringBuilder(rawPlainTextQuery.length() * 3);
            percentEncode(sb, rawPlainTextQuery, ENCODING_EXEMPT_4_PLAINTEXT_QUERY);
            return new Query(null, rawPlainTextQuery, sb.toString());
        }
    }

    /** A Fragment together with its percent-encoded form. */
    public record Fragment(String fragment, String encoded) {

        /**
         * Percent-encodes a fragment.
         *
         * @param fragment the raw (unencoded) fragment
         * @return the Fragment, or {@code null} if the text is null or empty
         */
        public static Fragment of(final String fragment) {
            if (fragment == null || fragment.isEmpty()) {
                return null;
            }
            final var sb = new StringBuilder(fragment.length() * 3);
            percentEncode(sb, fragment, ENCODING_EXEMPT_4_FRAGMENT);
            return new Fragment(fragment, sb.toString());
        }
    }

    /**
     * Appends {@code rawValue} to {@code sb}, percent-encoding the UTF-8 bytes of every code point
     * that is not in the exemption set.
     */
    private static void percentEncode(final StringBuilder sb, final String rawValue, final BitSet exemptFromPercentEncoding) {
        rawValue.codePoints().forEach(rawCodePoint -> {
            /*
             * A valid surrogate pair arrives here as ONE supplementary code point.
             * An orphan surrogate arrives as itself (U+D800..U+DFFF) and cannot be encoded as UTF-8.
             *
             * java.net.URLEncoder percent-encodes orphan surrogates as "%3F", i.e. '?'.
             * We substitute '?' as well; whether it is then percent-encoded depends on the exemptions.
             *
             * TODO You might like to consider using the Replacement Character U+FFFD instead.
             */
            final boolean orphanSurrogate =
                    rawCodePoint >= Character.MIN_SURROGATE && rawCodePoint <= Character.MAX_SURROGATE;
            final int codePoint = orphanSurrogate ? '?' : rawCodePoint;

            if (exemptFromPercentEncoding.get(codePoint)) {
                sb.append((char) codePoint); // the exemption sets only ever contain ASCII
                return;
            }
            for (final byte utf8Byte : encodeTo_UTF_8_bytes(codePoint)) {
                sb.append('%').append(HEX_FORMAT_UPPER.toHexDigits(utf8Byte));
            }
        });
    }

    /**
     * Encodes a single code point as UTF-8.
     *
     * Note:
     * Prior to November 2003, UTF-8 permitted code points requiring one to six bytes.
     * RFC 3629 restricts that to one to four bytes; the arithmetic below copes with both.
     *
     * @param codePoint a non-negative code point
     * @return its UTF-8 byte sequence
     */
    private static byte[] encodeTo_UTF_8_bytes(final int codePoint) {
        if (codePoint < 0x80) {
            // 1-byte code points are trivial and MUST be excluded: the byte-count formula below is wrong for them.
            return new byte[] {(byte) codePoint};
        }
        final int bitCount = Integer.SIZE - Integer.numberOfLeadingZeros(codePoint);
        final int byteCount = (bitCount + 3) / 5; // 8..11 bits -> 2, 12..16 -> 3, 17..21 -> 4, ...
        // Shifts 'byteCount' 1-bits followed by a 0-bit into the low-order byte: 0xC0, 0xE0, 0xF0, ...
        final int firstBytePrefix = 0x3F_00 >>> byteCount;
        final var utf8Bytes = new byte[byteCount];
        int remaining = codePoint;
        // Fill the continuation bytes from right to left, six payload bits at a time.
        for (int i = byteCount - 1; i > 0; i--) {
            utf8Bytes[i] = (byte) (0x80 | (0x3F & remaining));
            remaining >>>= 6;
        }
        // Whatever is left fits next to the prefix; the cast drops the prefix bits above the low byte.
        utf8Bytes[0] = (byte) (firstBytePrefix | (0x3F & remaining));
        return utf8Bytes;
    }

    /** @return a new builder with every component unset */
    public static UriBuilder newBuilder() {
        return new UriBuilder();
    }

    public UriBuilder setScheme(final String scheme) {
        this.scheme = scheme;
        return this;
    }

    public UriBuilder setUserInfo(final String userInfo) {
        this.userInfo = userInfo;
        return this;
    }

    public UriBuilder setHost(final String host) {
        this.host = host;
        return this;
    }

    public UriBuilder setPort(final int port) {
        this.port = port;
        return this;
    }

    public UriBuilder setPath(final String path) {
        this.path = path;
        return this;
    }

    /** Sets an already-encoded Query; {@code null} removes the Query. */
    public UriBuilder setQuery(final Query query) {
        this.query = query;
        return this;
    }

    /** Percent-encodes and sets a plain-text Query; {@code null} or empty removes the Query. */
    public UriBuilder setQuery(final String rawQuery) {
        this.query = Query.of(rawQuery);
        return this;
    }

    /** Percent-encodes and sets the Fragment; {@code null} or empty removes the Fragment. */
    public UriBuilder setFragment(final String fragment) {
        this.fragment = Fragment.of(fragment);
        return this;
    }

    /**
     * Builds the URI: everything except Query & Fragment is delegated to the multi-argument
     * {@link URI} constructor, then the percent-encoded Query & Fragment are appended to its text.
     *
     * @return the complete URI
     * @throws URISyntaxException if the components do not form a valid URI
     */
    public URI build() throws URISyntaxException {
        final var sb = new StringBuilder(buildPrefix().toString());
        if (this.query != null) {
            sb.append('?').append(this.query.encoded());
        }
        if (this.fragment != null) {
            sb.append('#').append(this.fragment.encoded());
        }
        return new URI(sb.toString());
    }

    /** Builds the URI without Query & Fragment. */
    private URI buildPrefix() throws URISyntaxException {
        return new URI(this.scheme, this.userInfo, this.host, this.port, this.path, /* Query */ null, /* Fragment */ null);
    }

    /** @return a new BitSet with exactly the given bits set */
    private static BitSet bitSetOf(final int... bitIndices) {
        final var bits = new BitSet(RFC_3986_BITSET_LENGTH);
        for (final int bitIndex : bitIndices) {
            bits.set(bitIndex);
        }
        return bits;
    }

    /** @return a new BitSet that is the union of the given BitSets (which are not modified) */
    private static BitSet bitSetOr(final BitSet... bitSets) {
        final var union = new BitSet(RFC_3986_BITSET_LENGTH);
        for (final BitSet bitSet : bitSets) {
            union.or(bitSet);
        }
        return union;
    }

    /** @return a new BitSet with the bits {@code fromIndex..toIndex} (both inclusive) set */
    private static BitSet bitSetRangeInclusive(final int fromIndex, final int toIndex) {
        final var bits = new BitSet(RFC_3986_BITSET_LENGTH);
        bits.set(fromIndex, toIndex + 1);
        return bits;
    }

    /** @return the set bits, in ascending order, each rendered as the character with that code */
    private static String toString(final BitSet bitSet) {
        return bitSet.stream()
                .collect(StringBuilder::new, (sb, bitIndex) -> sb.append((char) bitIndex), StringBuilder::append)
                .toString();
    }
}