KurtYoung commented on a change in pull request #8689: [FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString URL: https://github.com/apache/flink/pull/8689#discussion_r293718259
########## File path: flink-table/flink-table-runtime-blink/src/main/java/org/apache/flink/table/dataformat/BinaryString.java ########## @@ -654,531 +498,211 @@ private BinaryString trimSlow() { } /** - * Walk each character of current string from both ends, remove the character if it - * is in trim string. - * Return the new substring which both ends trim characters have been removed. + * Returns the index within this string of the first occurrence of the + * specified substring, starting at the specified index. * - * @param trimStr the trim string - * @return A subString which both ends trim characters have been removed. + * @param str the substring to search for. + * @param fromIndex the index from which to start the search. + * @return the index of the first occurrence of the specified substring, + * starting at the specified index, + * or {@code -1} if there is no such occurrence. */ - public BinaryString trim(BinaryString trimStr) { - if (trimStr == null) { - return null; - } - return trimLeft(trimStr).trimRight(trimStr); - } - - public BinaryString trimLeft() { + public int indexOf(BinaryString str, int fromIndex) { ensureMaterialized(); + str.ensureMaterialized(); + if (str.sizeInBytes == 0) { + return 0; + } if (inFirstSegment()) { - int s = 0; - // skip all of the space (0x20) in the left side - while (s < this.sizeInBytes && getByteOneSegment(s) == 0x20) { - s++; - } - if (s == this.sizeInBytes) { - // empty string - return EMPTY_UTF8; - } else { - return copyBinaryStringInOneSeg(s, this.sizeInBytes - 1); + // position in byte + int byteIdx = 0; + // position is char + int charIdx = 0; + while (byteIdx < sizeInBytes && charIdx < fromIndex) { + byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx)); + charIdx++; } + do { + if (byteIdx + str.sizeInBytes > sizeInBytes) { + return -1; + } + if (SegmentsUtil.equals(segments, offset + byteIdx, + str.segments, str.offset, str.sizeInBytes)) { + return charIdx; + } + byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx)); + charIdx++; + } while (byteIdx < sizeInBytes); + + return -1; } else { - return trimLeftSlow(); + return indexOfMultiSegs(str, fromIndex); } } - private BinaryString trimLeftSlow() { - int s = 0; + private int indexOfMultiSegs(BinaryString str, int fromIndex) { + // position in byte + int byteIdx = 0; + // position is char + int charIdx = 0; int segSize = segments[0].size(); - SegmentAndOffset front = firstSegmentAndOffset(segSize); - // skip all of the space (0x20) in the left side - while (s < this.sizeInBytes && front.value() == 0x20) { - s++; - front.nextByte(segSize); - } - if (s == this.sizeInBytes) { - // empty string - return EMPTY_UTF8; - } else { - return copyBinaryString(s, this.sizeInBytes - 1); + SegmentAndOffset index = firstSegmentAndOffset(segSize); + while (byteIdx < sizeInBytes && charIdx < fromIndex) { + int charBytes = numBytesForFirstByte(index.value()); + byteIdx += charBytes; + charIdx++; + index.skipBytes(charBytes, segSize); } + do { + if (byteIdx + str.sizeInBytes > sizeInBytes) { + return -1; + } + if (SegmentsUtil.equals(segments, offset + byteIdx, + str.segments, str.offset, str.sizeInBytes)) { + return charIdx; + } + int charBytes = numBytesForFirstByte(index.segment.get(index.offset)); + byteIdx += charBytes; + charIdx++; + index.skipBytes(charBytes, segSize); + } while (byteIdx < sizeInBytes); + + return -1; } /** - * Walk each character of current string from left end, remove the character if it - * is in trim string. Stops at the first character which is not in trim string. - * Return the new substring. + * Converts all of the characters in this {@code BinaryString} to upper case. * - * @param trimStr the trim string - * @return A subString which removes all of the character from the left side that is in - * trim string. + * @return the {@code BinaryString}, converted to uppercase. */ - public BinaryString trimLeft(BinaryString trimStr) { - ensureMaterialized(); - if (trimStr == null) { - return null; + public BinaryString toUpperCase() { + if (javaObject != null) { + return javaToUpperCase(); } - trimStr.ensureMaterialized(); - if (trimStr.isSpaceString()) { - return trimLeft(); + if (sizeInBytes == 0) { + return EMPTY_UTF8; } - if (inFirstSegment()) { - int searchIdx = 0; - while (searchIdx < this.sizeInBytes) { - int charBytes = numBytesForFirstByte(getByteOneSegment(searchIdx)); - BinaryString currentChar = copyBinaryStringInOneSeg(searchIdx, - searchIdx + charBytes - 1); - // try to find the matching for the character in the trimString characters. - if (trimStr.contains(currentChar)) { - searchIdx += charBytes; - } else { - break; - } + int size = segments[0].size(); + SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); + byte[] bytes = new byte[sizeInBytes]; + bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value()); + for (int i = 0; i < sizeInBytes; i++) { + byte b = segmentAndOffset.value(); + if (numBytesForFirstByte(b) != 1) { + // fallback + return javaToUpperCase(); } - // empty string - if (searchIdx >= sizeInBytes) { - return EMPTY_UTF8; - } else { - return copyBinaryStringInOneSeg(searchIdx, sizeInBytes - 1); + int upper = Character.toUpperCase((int) b); + if (upper > 127) { + // fallback + return javaToUpperCase(); } - } else { - return trimLeftSlow(trimStr); + bytes[i] = (byte) upper; + segmentAndOffset.nextByte(size); } + return fromBytes(bytes); } - private BinaryString trimLeftSlow(BinaryString trimStr) { - int searchIdx = 0; - int segSize = segments[0].size(); - SegmentAndOffset front = firstSegmentAndOffset(segSize); - while (searchIdx < this.sizeInBytes) { - int charBytes = numBytesForFirstByte(front.value()); - BinaryString currentChar = copyBinaryString(searchIdx, searchIdx + charBytes - 1); - if (trimStr.contains(currentChar)) { - searchIdx += charBytes; - front.skipBytes(charBytes, segSize); - } else { - break; - } + private BinaryString javaToUpperCase() { + return fromString(toString().toUpperCase()); + } + + /** + * Converts all of the characters in this {@code BinaryString} to lower case. + * + * @return the {@code BinaryString}, converted to lowercase. + */ + public BinaryString toLowerCase() { + if (javaObject != null) { + return javaToLowerCase(); } - if (searchIdx == this.sizeInBytes) { - // empty string + if (sizeInBytes == 0) { return EMPTY_UTF8; - } else { - return copyBinaryString(searchIdx, this.sizeInBytes - 1); } - } - - public BinaryString trimRight() { - ensureMaterialized(); - if (inFirstSegment()) { - int e = sizeInBytes - 1; - // skip all of the space (0x20) in the right side - while (e >= 0 && getByteOneSegment(e) == 0x20) { - e--; - } - - if (e < 0) { - // empty string - return EMPTY_UTF8; - } else { - return copyBinaryStringInOneSeg(0, e); - } - } else { - return trimRightSlow(); - } - } - - private BinaryString trimRightSlow() { - int e = sizeInBytes - 1; - int segSize = segments[0].size(); - SegmentAndOffset behind = lastSegmentAndOffset(segSize); - // skip all of the space (0x20) in the right side - while (e >= 0 && behind.value() == 0x20) { - e--; - behind.previousByte(segSize); - } - - if (e < 0) { - // empty string - return EMPTY_UTF8; - } else { - return copyBinaryString(0, e); - } - } - - /** - * Walk each character of current string from right end, remove the character if it - * is in trim string. Stops at the first character which is not in trim string. - * Return the new substring. - * - * @param trimStr the trim string - * @return A subString which removes all of the character from the right side that is in - * trim string. - */ - public BinaryString trimRight(BinaryString trimStr) { - ensureMaterialized(); - if (trimStr == null) { - return null; - } - trimStr.ensureMaterialized(); - if (trimStr.isSpaceString()) { - return trimRight(); - } - if (inFirstSegment()) { - int charIdx = 0; - int byteIdx = 0; - // each element in charLens is length of character in the source string - int[] charLens = new int[sizeInBytes]; - // each element in charStartPos is start position of first byte in the source string - int[] charStartPos = new int[sizeInBytes]; - while (byteIdx < sizeInBytes) { - charStartPos[charIdx] = byteIdx; - charLens[charIdx] = numBytesForFirstByte(getByteOneSegment(byteIdx)); - byteIdx += charLens[charIdx]; - charIdx++; - } - // searchIdx points to the first character which is not in trim string from the right - // end. - int searchIdx = sizeInBytes - 1; - charIdx -= 1; - while (charIdx >= 0) { - BinaryString currentChar = copyBinaryStringInOneSeg( - charStartPos[charIdx], - charStartPos[charIdx] + charLens[charIdx] - 1); - if (trimStr.contains(currentChar)) { - searchIdx -= charLens[charIdx]; - } else { - break; - } - charIdx--; - } - if (searchIdx < 0) { - // empty string - return EMPTY_UTF8; - } else { - return copyBinaryStringInOneSeg(0, searchIdx); + int size = segments[0].size(); + SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); + byte[] bytes = new byte[sizeInBytes]; + bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value()); + for (int i = 0; i < sizeInBytes; i++) { + byte b = segmentAndOffset.value(); + if (numBytesForFirstByte(b) != 1) { + // fallback + return javaToLowerCase(); } - } else { - return trimRightSlow(trimStr); - } - } - - private BinaryString trimRightSlow(BinaryString trimStr) { - int charIdx = 0; - int byteIdx = 0; - int segSize = segments[0].size(); - SegmentAndOffset index = firstSegmentAndOffset(segSize); - // each element in charLens is length of character in the source string - int[] charLens = new int[sizeInBytes]; - // each element in charStartPos is start position of first byte in the source string - int[] charStartPos = new int[sizeInBytes]; - while (byteIdx < sizeInBytes) { - charStartPos[charIdx] = byteIdx; - int charBytes = numBytesForFirstByte(index.value()); - charLens[charIdx] = charBytes; - byteIdx += charBytes; - charIdx++; - index.skipBytes(charBytes, segSize); - } - // searchIdx points to the first character which is not in trim string from the right - // end. - int searchIdx = sizeInBytes - 1; - charIdx -= 1; - while (charIdx >= 0) { - BinaryString currentChar = copyBinaryString( - charStartPos[charIdx], - charStartPos[charIdx] + charLens[charIdx] - 1); - if (trimStr.contains(currentChar)) { - searchIdx -= charLens[charIdx]; - } else { - break; + int lower = Character.toLowerCase((int) b); + if (lower > 127) { + // fallback + return javaToLowerCase(); } - charIdx--; - } - if (searchIdx < 0) { - // empty string - return EMPTY_UTF8; - } else { - return copyBinaryString(0, searchIdx); + bytes[i] = (byte) lower; + segmentAndOffset.nextByte(size); } + return fromBytes(bytes); } - public BinaryString trim(boolean leading, boolean trailing, BinaryString seek) { - ensureMaterialized(); - if (seek == null) { - return null; - } - if (leading && trailing) { - return trim(seek); - } else if (leading) { - return trimLeft(seek); - } else if (trailing) { - return trimRight(seek); - } else { - return this; - } + private BinaryString javaToLowerCase() { + return fromString(toString().toLowerCase()); } - /** - * Parse target string as key-value string and - * return the value matches key name. - * If accept any null arguments, return null. - * example: - * keyvalue('k1=v1;k2=v2', ';', '=', 'k2') = 'v2' - * keyvalue('k1:v1,k2:v2', ',', ':', 'k3') = NULL - * - * @param split1 separator between key-value tuple. - * @param split2 separator between key and value. - * @param keyName name of the key whose value you want return. - * - * @return target value. - */ - public BinaryString keyValue(byte split1, byte split2, BinaryString keyName) { - ensureMaterialized(); - if (keyName == null || keyName.getSizeInBytes() == 0) { - return null; - } - if (inFirstSegment() && keyName.inFirstSegment()) { - // position in byte - int byteIdx = 0; - // position of last split1 - int lastSplit1Idx = -1; - while (byteIdx < sizeInBytes) { - // If find next split1 in str, process current kv - if (segments[0].get(offset + byteIdx) == split1) { - int currentKeyIdx = lastSplit1Idx + 1; - // If key of current kv is keyName, return the value directly - BinaryString value = findValueOfKey(split2, keyName, currentKeyIdx, byteIdx); - if (value != null) { - return value; - } - lastSplit1Idx = byteIdx; - } - byteIdx++; - } - // process the string which is not ends with split1 - int currentKeyIdx = lastSplit1Idx + 1; - return findValueOfKey(split2, keyName, currentKeyIdx, sizeInBytes); - } else { - return keyValueSlow(split1, split2, keyName); - } - } + // ------------------------------------------------------------------------------------------ + // Internal methods on BinaryString + // ------------------------------------------------------------------------------------------ - private BinaryString findValueOfKey( - byte split, - BinaryString keyName, - int start, - int end) { - int keyNameLen = keyName.sizeInBytes; - for (int idx = start; idx < end; idx++) { - if (segments[0].get(offset + idx) == split) { - if (idx == start + keyNameLen && - segments[0].equalTo(keyName.segments[0], offset + start, - keyName.offset, keyNameLen)) { - int valueIdx = idx + 1; - int valueLen = end - valueIdx; - byte[] bytes = new byte[valueLen]; - segments[0].get(offset + valueIdx, bytes, 0, valueLen); - return fromBytes(bytes, 0, valueLen); - } else { - return null; - } - } - } - return null; + byte getByteOneSegment(int i) { Review comment: private? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services