On Mon, 10 Jun 2024 14:32:46 GMT, Emanuel Peter <epe...@openjdk.org> wrote:
>> # 1. Compare with unsafe branch >> >> 1. current (`5e815`) >> https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406 >> 2. unsafe (`adc220`) >> https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406_unsafe >> >> I think the performance of the Unsafe branch may be the best data for the C2 >> optimizer. @eme64 can help me see if C2 can do it? >> >> # 2. Benchmark Commands >> >> make test TEST="micro:java.lang.StringBuilders.toStringCharWithBool8" >> make test TEST="micro:java.lang.StringBuilders.toStringCharWithNull8" >> >> >> # 3. Implementation of Unsafe Branch >> >> class AbstractStringBuilder { >> static final Unsafe UNSAFE = Unsafe.getUnsafe(); >> >> static final int NULL_LATIN1; >> static final int TRUE_LATIN1; >> static final int FALS_LATIN1; >> >> static final long NULL_UTF16; >> static final long TRUE_UTF16; >> static final long FALS_UTF16; >> >> static { >> byte[] bytes4 = new byte[4]; >> byte[] bytes8 = new byte[8]; >> >> bytes4[0] = 'n'; >> bytes4[1] = 'u'; >> bytes4[2] = 'l'; >> bytes4[3] = 'l'; >> NULL_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); >> NULL_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> >> bytes4[0] = 't'; >> bytes4[1] = 'r'; >> bytes4[2] = 'u'; >> bytes4[3] = 'e'; >> TRUE_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); >> TRUE_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> >> bytes4[0] = 'f'; >> bytes4[1] = 'a'; >> bytes4[2] = 'l'; >> bytes4[3] = 's'; >> FALS_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); >> FALS_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> } >> >> private AbstractStringBuilder appendNull() { >> ensureCapacityInternal(count + 4); >> int count = this.count; >> byte[] val = this.value; >> if (isLatin1()) { >> UNSAFE.putInt( >> val, >> Unsafe.ARRAY_BYTE_BASE_OFFSET + count, >> NULL_LATIN1); >> } else { >> UNSAFE.putLong( >> val, >> Unsafe.ARRAY_BYTE_BASE_OFFSET + (count << 1), >> NULL_UTF16); >> ... > > @wenshao >> I think the performance of the Unsafe branch may be the best data for the C2 >> optimizer. @eme64 can help me see if C2 can do it? > > Have you tried to see if the optimization actually was done/taken? You can > use the `TraceMergeStores,` flag. Can you present the generated assembly code > of the benchmarks, and explain the difference based on the generated assembly > code? You can run JMH penchmarks with `perf`. These two blogs may help you: > > http://psy-lob-saw.blogspot.com/2015/07/jmh-perfasm.html > https://shipilev.net/blog/2016/arrays-wisdom-ancients/#_meet_jmh_prof_perfasm > > @liach I don't think it makes a difference if it is `int` or `byte` > constants. Or what exactly is the code change you are proposing? @eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect. class AbstractStringBuilder { private AbstractStringBuilder appendNull() { // ... StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l'); // ... } public AbstractStringBuilder append(boolean b) { // ... StringUTF16.putCharsAt(val, count, 't', 'r', 'u', 'e'); // ... StringUTF16.putCharsAt(val, count, 'f', 'a', 'l', 's', 'e'); // ... } } class StringUTF16 { public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) { putChar(value, i , c1); putChar(value, i + 1, c2); putChar(value, i + 2, c3); putChar(value, i + 3, c4); } @IntrinsicCandidate // intrinsic performs no bounds checks static void putChar(byte[] val, int index, int c) { assert index >= 0 && index < length(val) : "Trusted caller missed bounds check"; index <<= 1; val[index++] = (byte)(c >> HI_BYTE_SHIFT); val[index] = (byte)(c >> LO_BYTE_SHIFT); } } The code for manually merging storage is as follows, without using Unsafe: class AbstractStringBuilder { static final long NULL_UTF16; static final long TRUE_UTF16; static final long FALS_UTF16; static { byte[] bytes = new byte[8]; StringUTF16.putCharsAt(bytes, 0, 'n', 'u', 'l', 'l'); NULL_UTF16 = getLong(bytes, 0); StringUTF16.putCharsAt(bytes, 0, 't', 'r', 'u', 'e'); TRUE_UTF16 = getLong(bytes, 0); StringUTF16.putCharsAt(bytes, 0, 'f', 'a', 'l', 's'); FALS_UTF16 = getLong(bytes, 0); } private static long getLong(byte[] bytes, int offset) { return (((long)bytes[offset ] & 0xff) ) | (((long)bytes[offset + 1] & 0xff) << 8) | (((long)bytes[offset + 2] & 0xff) << 16) | (((long)bytes[offset + 3] & 0xff) << 24) | (((long)bytes[offset + 4] & 0xff) << 32) | (((long)bytes[offset + 5] & 0xff) << 40) | (((long)bytes[offset + 6] & 0xff) << 48) | (((long)bytes[offset + 7] & 0xff) << 56); } private static void setLong(byte[] array, int offset, long value) { array[offset] = (byte) value; array[offset + 1] = (byte) (value >> 8); array[offset + 2] = (byte) (value >> 16); array[offset + 3] = (byte) (value >> 24); array[offset + 4] = (byte) (value >> 32); array[offset + 5] = (byte) (value >> 40); array[offset + 6] = (byte) (value >> 48); array[offset + 7] = (byte) (value >> 56); } private AbstractStringBuilder appendNull() { int count = this.count; ensureCapacityInternal(count + 4); byte[] val = this.value; if (isLatin1()) { val[count ] = 'n'; val[count + 1] = 'u'; val[count + 2] = 'l'; val[count + 3] = 'l'; } else { setLong(val, count, NULL_UTF16); } this.count = count + 4; return this; } public AbstractStringBuilder append(boolean b) { int count = this.count; int spaceNeeded = count + (b ? 4 : 5); ensureCapacityInternal(spaceNeeded); byte[] val = this.value; if (isLatin1()) { if (b) { val[count ] = 't'; val[count + 1] = 'r'; val[count + 2] = 'u'; val[count + 3] = 'e'; } else { val[count ] = 'f'; val[count + 1] = 'a'; val[count + 2] = 'l'; val[count + 3] = 's'; val[count + 4] = 'e'; } } else { setLong(val, count, b ? TRUE_UTF16 : FALS_UTF16); if (!b) { StringUTF16.putChar(val, count + 4, 'e'); } } this.count = spaceNeeded; return this; } } The getLong/setLong methods here can be optimized and merged by C2. Maybe we need a public class that does not use Unsafe to implement these methods. They are needed in many places. Maybe it is appropriate to improve them based on ByteArray/ByteArrayLittleEndian. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2159458425