github-actions[bot] commented on code in PR #64009:
URL: https://github.com/apache/doris/pull/64009#discussion_r3451726977
##########
fe/fe-filesystem/fe-filesystem-spi/src/main/java/org/apache/doris/filesystem/spi/S3CompatibleFileSystem.java:
##########
@@ -907,6 +922,227 @@ private static String expandNumericRanges(String pattern)
{
return sb.toString();
}
+ private static boolean isDeterministicKeyPattern(String pattern) {
+ if (pattern.indexOf('*') >= 0 || pattern.indexOf('?') >= 0 ||
pattern.indexOf('\\') >= 0) {
+ return false;
+ }
+ int i = 0;
+ while (i < pattern.length()) {
+ if (pattern.charAt(i) != '[') {
+ i++;
+ continue;
+ }
+ int end = pattern.indexOf(']', i + 1);
+ if (end < 0 || end == i + 1) {
+ return false;
+ }
+ char first = pattern.charAt(i + 1);
+ if (first == '!' || first == '^') {
+ return false;
+ }
+ i = end + 1;
+ }
+ return true;
+ }
+
+ private static List<Character> expandBracketContent(String content) {
+ List<Character> values = new ArrayList<>();
+ int i = 0;
+ while (i < content.length()) {
+ if (i + 2 < content.length() && content.charAt(i + 1) == '-') {
+ char from = content.charAt(i);
+ char to = content.charAt(i + 2);
+ int step = from <= to ? 1 : -1;
+ for (char c = from; step > 0 ? c <= to : c >= to; c += step) {
+ if (!values.contains(c)) {
+ values.add(c);
+ }
+ }
+ i += 3;
+ } else {
+ char c = content.charAt(i);
+ if (!values.contains(c)) {
+ values.add(c);
+ }
+ i++;
+ }
+ }
+ return values;
+ }
+
+ private static List<String> expandDeterministicKeyPattern(String pattern,
int maxPaths) {
+ if (!isDeterministicKeyPattern(pattern)) {
+ return null;
+ }
+ List<String> expanded = new ArrayList<>();
+ int expansionLimit = maxPaths == Integer.MAX_VALUE ? Integer.MAX_VALUE
: maxPaths + 1;
+ expandDeterministicKeyPatternBounded(pattern, expanded,
expansionLimit);
+ if (expanded.size() > maxPaths) {
+ return null;
+ }
+ List<String> keys = new ArrayList<>(new LinkedHashSet<>(expanded));
+ Collections.sort(keys);
+ return keys;
+ }
+
+ private static void expandDeterministicKeyPatternBounded(String pattern,
List<String> result, int maxPaths) {
+ if (result.size() >= maxPaths) {
+ return;
+ }
+ int bracketStart = pattern.indexOf('[');
+ int braceStart = pattern.indexOf('{');
+ if (bracketStart < 0 && braceStart < 0) {
+ result.add(pattern);
+ return;
+ }
+ if (bracketStart >= 0 && (braceStart < 0 || bracketStart <
braceStart)) {
+ int bracketEnd = pattern.indexOf(']', bracketStart + 1);
+ String prefix = pattern.substring(0, bracketStart);
+ String suffix = pattern.substring(bracketEnd + 1);
+ for (char alternative :
expandBracketContent(pattern.substring(bracketStart + 1, bracketEnd))) {
+ if (result.size() >= maxPaths) {
+ return;
+ }
+ expandDeterministicKeyPatternBounded(prefix + alternative +
suffix, result, maxPaths);
+ }
+ return;
+ }
+ int braceEnd = findMatchingBrace(pattern, braceStart);
+ if (braceEnd < 0) {
+ result.add(pattern);
+ return;
+ }
+ String prefix = pattern.substring(0, braceStart);
+ String suffix = pattern.substring(braceEnd + 1);
+ for (String alternative :
expandBraceAlternatives(pattern.substring(braceStart + 1, braceEnd))) {
+ if (result.size() >= maxPaths) {
+ return;
+ }
+ expandDeterministicKeyPatternBounded(prefix + alternative +
suffix, result, maxPaths);
+ }
+ }
+
+ private static List<String> expandBraceAlternatives(String content) {
+ List<String> alternatives = splitBraceContent(content);
+ boolean mixed = alternatives.size() > 1;
+ LinkedHashSet<String> expanded = new LinkedHashSet<>();
+ for (String alternative : alternatives) {
+ expanded.addAll(expandNumericRangeAlternative(alternative, mixed));
+ }
+ return new ArrayList<>(expanded);
+ }
+
+ private static List<String> expandNumericRangeAlternative(String
alternative, boolean mixed) {
+ java.util.regex.Pattern rangeSegment = java.util.regex.Pattern.compile(
+ "(-?\\d+)\\.\\.(-?\\d+)");
+ java.util.regex.Matcher matcher =
rangeSegment.matcher(alternative.trim());
Review Comment:
The deterministic fast path is trimming literal brace alternatives, which
changes the set of S3 keys that are probed. For example, `file{ a,b }.csv` is a
valid glob whose list/regex path preserves spaces and matches `file a.csv` and
`fileb .csv`; `globToRegex` only treats the comma as alternation and leaves
spaces literal. This new path reaches `expandNumericRangeAlternative`, and the
non-range branch returns `alternative.trim()`, so it HEADs `filea.csv` and
`fileb.csv` instead. Since S3 keys can contain spaces, this silently returns
the wrong files for valid TVF patterns. Please keep the original alternative
text for non-range alternatives, and only trim a temporary value when
detecting/parsing numeric ranges.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]