KurtYoung commented on a change in pull request #8689: [FLINK-12802][table-runtime-blink] Reducing the Code of BinaryString
URL: https://github.com/apache/flink/pull/8689#discussion_r293718664
########## File path: flink-table/flink-table-runtime-blink/src/main/java/org/apache/flink/table/dataformat/BinaryString.java ########## @@ -1242,599 +763,32 @@ private void skipBytes(int n, int segSize) { } } - private byte value() { + byte value() { return this.segment.get(this.offset); } } /** - * Parses this BinaryString to Long. - * - * <p>Note that, in this method we accumulate the result in negative format, and convert it to - * positive format at the end, if this string is not started with '-'. This is because min value - * is bigger than max value in digits, e.g. Long.MAX_VALUE is '9223372036854775807' and - * Long.MIN_VALUE is '-9223372036854775808'. - * - * <p>This code is mostly copied from LazyLong.parseLong in Hive. - * @return Long value if the parsing was successful else null. - */ - public Long toLong() { - ensureMaterialized(); - if (sizeInBytes == 0) { - return null; - } - int size = segments[0].size(); - SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); - int totalOffset = 0; - - byte b = segmentAndOffset.value(); - final boolean negative = b == '-'; - if (negative || b == '+') { - segmentAndOffset.nextByte(size); - totalOffset++; - if (sizeInBytes == 1) { - return null; - } - } - - long result = 0; - final byte separator = '.'; - final int radix = 10; - final long stopValue = Long.MIN_VALUE / radix; - while (totalOffset < this.sizeInBytes) { - b = segmentAndOffset.value(); - totalOffset++; - segmentAndOffset.nextByte(size); - if (b == separator) { - // We allow decimals and will return a truncated integral in that case. - // Therefore we won't throw an exception here (checking the fractional - // part happens below.) - break; - } - - int digit; - if (b >= '0' && b <= '9') { - digit = b - '0'; - } else { - return null; - } - - // We are going to process the new digit and accumulate the result. 
However, before - // doing this, if the result is already smaller than the - // stopValue(Long.MIN_VALUE / radix), then result * 10 will definitely be smaller - // than minValue, and we can stop. - if (result < stopValue) { - return null; - } - - result = result * radix - digit; - // Since the previous result is less than or equal to - // stopValue(Long.MIN_VALUE / radix), we can just use `result > 0` to check overflow. - // If result overflows, we should stop. - if (result > 0) { - return null; - } - } - - // This is the case when we've encountered a decimal separator. The fractional - // part will not change the number, but we will verify that the fractional part - // is well formed. - while (totalOffset < sizeInBytes) { - byte currentByte = segmentAndOffset.value(); - if (currentByte < '0' || currentByte > '9') { - return null; - } - totalOffset++; - segmentAndOffset.nextByte(size); - } - - if (!negative) { - result = -result; - if (result < 0) { - return null; - } - } - return result; - } - - /** - * Parses this BinaryString to Int. - * - * <p>Note that, in this method we accumulate the result in negative format, and convert it to - * positive format at the end, if this string is not started with '-'. This is because min value - * is bigger than max value in digits, e.g. Integer.MAX_VALUE is '2147483647' and - * Integer.MIN_VALUE is '-2147483648'. - * - * <p>This code is mostly copied from LazyInt.parseInt in Hive. - * - * <p>Note that, this method is almost same as `toLong`, but we leave it duplicated for performance - * reasons, like Hive does. - * @return Integer value if the parsing was successful else null. 
- */ - public Integer toInt() { - ensureMaterialized(); - if (sizeInBytes == 0) { - return null; - } - int size = segments[0].size(); - SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); - int totalOffset = 0; - - byte b = segmentAndOffset.value(); - final boolean negative = b == '-'; - if (negative || b == '+') { - segmentAndOffset.nextByte(size); - totalOffset++; - if (sizeInBytes == 1) { - return null; - } - } - - int result = 0; - final byte separator = '.'; - final int radix = 10; - final long stopValue = Integer.MIN_VALUE / radix; - while (totalOffset < this.sizeInBytes) { - b = segmentAndOffset.value(); - totalOffset++; - segmentAndOffset.nextByte(size); - if (b == separator) { - // We allow decimals and will return a truncated integral in that case. - // Therefore we won't throw an exception here (checking the fractional - // part happens below.) - break; - } - - int digit; - if (b >= '0' && b <= '9') { - digit = b - '0'; - } else { - return null; - } - - // We are going to process the new digit and accumulate the result. However, before - // doing this, if the result is already smaller than the - // stopValue(Long.MIN_VALUE / radix), then result * 10 will definitely be smaller - // than minValue, and we can stop. - if (result < stopValue) { - return null; - } - - result = result * radix - digit; - // Since the previous result is less than or equal to - // stopValue(Long.MIN_VALUE / radix), we can just use `result > 0` to check overflow. - // If result overflows, we should stop. - if (result > 0) { - return null; - } - } - - // This is the case when we've encountered a decimal separator. The fractional - // part will not change the number, but we will verify that the fractional part - // is well formed. 
- while (totalOffset < sizeInBytes) { - byte currentByte = segmentAndOffset.value(); - if (currentByte < '0' || currentByte > '9') { - return null; - } - totalOffset++; - segmentAndOffset.nextByte(size); - } - - if (!negative) { - result = -result; - if (result < 0) { - return null; - } - } - return result; - } - - public Short toShort() { - Integer intValue = toInt(); - if (intValue != null) { - short result = intValue.shortValue(); - if (result == intValue) { - return result; - } - } - return null; - } - - public Byte toByte() { - Integer intValue = toInt(); - if (intValue != null) { - byte result = intValue.byteValue(); - if (result == intValue) { - return result; - } - } - return null; - } - - public Double toDouble() { - try { - return Double.valueOf(toString()); - } catch (NumberFormatException e) { - return null; - } - } - - public Float toFloat() { - try { - return Float.valueOf(toString()); - } catch (NumberFormatException e) { - return null; - } - } - - /** - * Parses this BinaryString to Decimal. - * - * @return Decimal value if the parsing was successful, or null if overflow - * @throws NumberFormatException if the parsing failed. - */ - public Decimal toDecimal(int precision, int scale) { - ensureMaterialized(); - if (precision > Decimal.MAX_LONG_DIGITS || this.sizeInBytes > Decimal.MAX_LONG_DIGITS) { - return toDecimalSlow(precision, scale); - } - - // Data in Decimal is stored by one long value if `precision` <= Decimal.MAX_LONG_DIGITS. - // In this case we can directly extract the value from memory segment. 
- int size = getSegments()[0].size(); - SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); - int totalOffset = 0; - - // Remove white spaces at the beginning - byte b = 0; - while (totalOffset < this.sizeInBytes) { - b = segmentAndOffset.value(); - if (b != ' ' && b != '\n' && b != '\t') { - break; - } - totalOffset++; - segmentAndOffset.nextByte(size); - } - if (totalOffset == this.sizeInBytes) { - // all whitespaces - return null; - } - - // ======= Significand part begin ======= - final boolean negative = b == '-'; - if (negative || b == '+') { - segmentAndOffset.nextByte(size); - totalOffset++; - if (totalOffset == this.sizeInBytes) { - // only contains prefix plus/minus - return null; - } - } - - long significand = 0; - int exp = 0; - int significandLen = 0, pointPos = -1; - - while (totalOffset < this.sizeInBytes) { - b = segmentAndOffset.value(); - totalOffset++; - segmentAndOffset.nextByte(size); - - if (b >= '0' && b <= '9') { - // No need to worry about overflow, because this.sizeInBytes <= Decimal.MAX_LONG_DIGITS - significand = significand * 10 + (b - '0'); - significandLen++; - } else if (b == '.') { - if (pointPos >= 0) { - // More than one decimal point - return null; - } - pointPos = significandLen; - } else { - break; - } - } - - if (pointPos < 0) { - pointPos = significandLen; - } - if (negative) { - significand = -significand; - } - // ======= Significand part end ======= - - // ======= Exponential part begin ======= - if ((b == 'e' || b == 'E') && totalOffset < this.sizeInBytes) { - b = segmentAndOffset.value(); - final boolean expNegative = b == '-'; - if (expNegative || b == '+') { - segmentAndOffset.nextByte(size); - totalOffset++; - if (totalOffset == this.sizeInBytes) { - return null; - } - } - - int expDigits = 0; - // As `precision` <= 18, value absolute range is limited to 10^-18 ~ 10^18. 
- // The worst case is <18-digits>E-36 - final int expStopValue = 40; - - while (totalOffset < this.sizeInBytes) { - b = segmentAndOffset.value(); - totalOffset++; - segmentAndOffset.nextByte(size); - - if (b >= '0' && b <= '9') { - // No need to worry about larger exponents, - // because they will produce overflow or underflow - if (expDigits < expStopValue) { - expDigits = expDigits * 10 + (b - '0'); - } - } else { - break; - } - } - - if (expNegative) { - expDigits = -expDigits; - } - exp += expDigits; - } - exp -= significandLen - pointPos; - // ======= Exponential part end ======= - - // Check for invalid character at the end - while (totalOffset < this.sizeInBytes) { - b = segmentAndOffset.value(); - totalOffset++; - segmentAndOffset.nextByte(size); - // White spaces are allowed at the end - if (b != ' ' && b != '\n' && b != '\t') { - return null; - } - } - - // Round exp to scale - int change = exp + scale; - if (significandLen + change > precision) { - // Overflow - return null; - } - if (change >= 0) { - significand *= Decimal.POW10[change]; - } else { - int k = negative ? -5 : 5; - significand = (significand + k * Decimal.POW10[-change - 1]) / Decimal.POW10[-change]; - } - return Decimal.fromLong(significand, precision, scale); - } - - private Decimal toDecimalSlow(int precision, int scale) { - // As data in Decimal is currently stored by BigDecimal if `precision` > Decimal.MAX_LONG_DIGITS, - // and BigDecimal only supports String or char[] for its constructor, - // we can't directly extract the value from BinaryString. - // - // As BigDecimal(char[], int, int) is faster than BigDecimal(String, int, int), - // we extract char[] from the memory segment and pass it to the constructor of BigDecimal. 
- char[] chars = SegmentsUtil.allocateReuseChars(sizeInBytes); - int len; - if (segments.length == 1) { - len = StringUtf8Utils.decodeUTF8Strict(segments[0], offset, sizeInBytes, chars); - } else { - byte[] bytes = SegmentsUtil.allocateReuseBytes(sizeInBytes); - ensureMaterialized(); - SegmentsUtil.copyToBytes(segments, offset, bytes, 0, sizeInBytes); - len = StringUtf8Utils.decodeUTF8Strict(bytes, 0, sizeInBytes, chars); - } - - if (len < 0) { - return null; - } else { - // Trim white spaces - int start = 0, end = len; - for (int i = 0; i < len; i++) { - if (chars[i] != ' ' && chars[i] != '\n' && chars[i] != '\t') { - start = i; - break; - } - } - for (int i = len - 1; i >= 0; i--) { - if (chars[i] != ' ' && chars[i] != '\n' && chars[i] != '\t') { - end = i + 1; - break; - } - } - try { - BigDecimal bd = new BigDecimal(chars, start, end - start); - return Decimal.fromBigDecimal(bd, precision, scale); - } catch (NumberFormatException nfe) { - return null; - } - } - } - - /** - * Returns the upper case of this string. - */ - public BinaryString toUpperCase() { - if (javaObject != null) { - return toUpperCaseSlow(); - } - if (sizeInBytes == 0) { - return EMPTY_UTF8; - } - int size = segments[0].size(); - SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); - byte[] bytes = new byte[sizeInBytes]; - bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value()); - for (int i = 0; i < sizeInBytes; i++) { - byte b = segmentAndOffset.value(); - if (numBytesForFirstByte(b) != 1) { - // fallback - return toUpperCaseSlow(); - } - int upper = Character.toUpperCase((int) b); - if (upper > 127) { - // fallback - return toUpperCaseSlow(); - } - bytes[i] = (byte) upper; - segmentAndOffset.nextByte(size); - } - return fromBytes(bytes); - } - - private BinaryString toUpperCaseSlow() { - return fromString(toString().toUpperCase()); - } - - /** - * Returns the lower case of this string. 
- */ - public BinaryString toLowerCase() { - if (javaObject != null) { - return toLowerCaseSlow(); - } - if (sizeInBytes == 0) { - return EMPTY_UTF8; - } - int size = segments[0].size(); - SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); - byte[] bytes = new byte[sizeInBytes]; - bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value()); - for (int i = 0; i < sizeInBytes; i++) { - byte b = segmentAndOffset.value(); - if (numBytesForFirstByte(b) != 1) { - // fallback - return toLowerCaseSlow(); - } - int lower = Character.toLowerCase((int) b); - if (lower > 127) { - // fallback - return toLowerCaseSlow(); - } - bytes[i] = (byte) lower; - segmentAndOffset.nextByte(size); - } - return fromBytes(bytes); - } - - private BinaryString toLowerCaseSlow() { - return fromString(toString().toLowerCase()); - } - - /** - * <p>Splits the provided text into an array, separator string specified. </p> - * - * <p>The separator is not included in the returned String array. - * Adjacent separators are treated as separators for empty tokens.</p> - * - * <p>A {@code null} separator splits on whitespace.</p> - * - * <pre> - * "".splitByWholeSeparatorPreserveAllTokens(*) = [] - * "ab de fg".splitByWholeSeparatorPreserveAllTokens(null) = ["ab", "de", "fg"] - * "ab de fg".splitByWholeSeparatorPreserveAllTokens(null) = ["ab", "", "", "de", "fg"] - * "ab:cd:ef".splitByWholeSeparatorPreserveAllTokens(":") = ["ab", "cd", "ef"] - * "ab-!-cd-!-ef".splitByWholeSeparatorPreserveAllTokens("-!-") = ["ab", "cd", "ef"] - * </pre> - * - * <p>Note: return BinaryStrings is reuse MemorySegments from this.</p> - * - * @param separator String containing the String to be used as a delimiter, - * {@code null} splits on whitespace - * @return an array of parsed Strings, {@code null} if null String was input - * @since 2.4 - */ - public BinaryString[] splitByWholeSeparatorPreserveAllTokens(BinaryString separator) { - ensureMaterialized(); - final int len = sizeInBytes; - - if (len == 0) { - 
return EMPTY_STRING_ARRAY; - } - - if (separator == null || EMPTY_UTF8.equals(separator)) { - // Split on whitespace. - return splitByWholeSeparatorPreserveAllTokens(fromString(" ")); - } - separator.ensureMaterialized(); - - final int separatorLength = separator.sizeInBytes; - - final ArrayList<BinaryString> substrings = new ArrayList<>(); - int beg = 0; - int end = 0; - while (end < len) { - end = SegmentsUtil.find( - segments, offset + beg, sizeInBytes - beg, - separator.segments, separator.offset, separator.sizeInBytes) - offset; - - if (end > -1) { - if (end > beg) { - - // The following is OK, because String.substring( beg, end ) excludes - // the character at the position 'end'. - substrings.add(BinaryString.fromAddress(segments, offset + beg, end - beg)); - - // Set the starting point for the next search. - // The following is equivalent to beg = end + (separatorLength - 1) + 1, - // which is the right calculation: - beg = end + separatorLength; - } else { - // We found a consecutive occurrence of the separator. - substrings.add(EMPTY_UTF8); - beg = end + separatorLength; - } - } else { - // String.substring( beg ) goes from 'beg' to the end of the String. - substrings.add(BinaryString.fromAddress(segments, offset + beg, sizeInBytes - beg)); - end = len; - } - } - - return substrings.toArray(new BinaryString[0]); - } - - /** - * Calculate the hash value of a given string use {@link MessageDigest}. 
- */ - public BinaryString hash(MessageDigest md) { - String str = EncodingUtils.hex(md.digest(getBytes())); - return fromString(str); - } - - public BinaryString hash(String algorithm) throws NoSuchAlgorithmException { - return hash(MessageDigest.getInstance(algorithm)); - } - - private static final List<BinaryString> TRUE_STRINGS = - Stream - .of("t", "true", "y", "yes", "1") - .map(BinaryString::fromString) - .peek(BinaryString::ensureMaterialized) - .collect(Collectors.toList()); - - private static final List<BinaryString> FALSE_STRINGS = - Stream - .of("f", "false", "n", "no", "0") - .map(BinaryString::fromString) - .peek(BinaryString::ensureMaterialized) - .collect(Collectors.toList()); - - /** - * Decide boolean representation of a string. + * Returns the number of bytes for a code point with the first byte as `b`. + * @param b The first byte of a code point */ - public Boolean toBooleanSQL() { - if (TRUE_STRINGS.contains(toLowerCase())) { - return true; - } else if (FALSE_STRINGS.contains(toLowerCase())) { - return false; + static int numBytesForFirstByte(final byte b) { Review comment: move this to `BinaryStringUtil`? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services