This is an automated email from the ASF dual-hosted git repository. dockerzhang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/inlong.git
The following commit(s) were added to refs/heads/master by this push: new e50f45ba8c [INLONG-10768][Sort] Csv utils support specified the max split field size (#10769) e50f45ba8c is described below commit e50f45ba8c40f7d61ea0079657cc683f2619c8c3 Author: vernedeng <verned...@apache.org> AuthorDate: Sun Aug 11 15:44:58 2024 +0800 [INLONG-10768][Sort] Csv utils support specified the max split field size (#10769) --- .../inlong/sort/formats/util/StringUtils.java | 69 +++++++++++++++++++++- .../sort/formats/common/StringUtilsTest.java | 41 +++++++++++++ 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java index f33ad8e825..3ea6678ca1 100644 --- a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java +++ b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java @@ -387,6 +387,19 @@ public class StringUtils { return splitCsv(text, delimiter, escapeChar, quoteChar, lineDelimiter, false); } + /** + * @see StringUtils#splitCsv(String, Character, Character, Character, Character, boolean, Integer) + */ + public static String[][] splitCsv( + @Nonnull String text, + @Nonnull Character delimiter, + @Nullable Character escapeChar, + @Nullable Character quoteChar, + @Nullable Character lineDelimiter, + boolean deleteHeadDelimiter) { + return splitCsv(text, delimiter, escapeChar, quoteChar, lineDelimiter, deleteHeadDelimiter, null); + } + /** * Splits the csv text, which may contains multiple lines of data. * @@ -402,6 +415,7 @@ public class StringUtils { * @param lineDelimiter The delimiter between lines, e.g. '\n'. * @param deleteHeadDelimiter If true and the leading character of a line * is a delimiter, it will be ignored. 
+ * @param maxFieldSize The max field size of one single line * @return A 2-D String array representing the parsed data, where the 1st * dimension is row and the 2nd dimension is column. */ @@ -411,9 +425,16 @@ public class StringUtils { @Nullable Character escapeChar, @Nullable Character quoteChar, @Nullable Character lineDelimiter, - boolean deleteHeadDelimiter) { + boolean deleteHeadDelimiter, + @Nullable Integer maxFieldSize) { + if (maxFieldSize != null && maxFieldSize <= 0) { + return new String[0][]; + } + List<String[]> lines = new ArrayList<>(); List<String> fields = new ArrayList<>(); + int splittedSize = 0; + int lastFieldStartIndex = 0; StringBuilder stringBuilder = new StringBuilder(); int state = STATE_NORMAL; @@ -431,6 +452,14 @@ public class StringUtils { String field = stringBuilder.toString(); fields.add(field); stringBuilder.setLength(0); + + splittedSize++; + // if the last field, mark the last field start index + if (maxFieldSize != null && splittedSize == maxFieldSize - 1) { + if (i + 1 < text.length()) { + lastFieldStartIndex = i + 1; + } + } break; case STATE_ESCAPING: stringBuilder.append(ch); @@ -471,10 +500,19 @@ public class StringUtils { case STATE_NORMAL: String field = stringBuilder.toString(); fields.add(field); - lines.add(fields.toArray(new String[0])); + // if the max field size < the real field size, + // remove the extra fields and copy the latest field from lastFieldStartIndex to current index + if (maxFieldSize != null && fields.size() > maxFieldSize) { + fields = replaceLastField(fields, maxFieldSize, text, lastFieldStartIndex, i); + } + // reset the lastFieldStartIndex for new line + lastFieldStartIndex = i + 1; + + lines.add(fields.toArray(new String[0])); stringBuilder.setLength(0); fields.clear(); + splittedSize = 0; break; case STATE_ESCAPING: stringBuilder.append(ch); @@ -498,6 +536,11 @@ public class StringUtils { case STATE_QUOTING: String field = stringBuilder.toString(); fields.add(field); + + if (maxFieldSize != 
null && fields.size() > maxFieldSize) { + fields = replaceLastField(fields, maxFieldSize, text, lastFieldStartIndex, text.length()); + } + lines.add(fields.toArray(new String[0])); String[][] result = new String[lines.size()][]; @@ -510,6 +553,28 @@ public class StringUtils { } } + /** + * if the max field size < the real field size, + * remove the extra fields and copy the latest field from lastFieldStartIndex to lastFieldEndIndex + * + * @param fields Target field list + * @param maxFieldSize Specified max fieldSize + * @param text Origin text + * @param lastFieldStartIndex Start index of last field + * @param lastFieldEndIndex End index of last field + */ + private static List<String> replaceLastField( + List<String> fields, + int maxFieldSize, + String text, + int lastFieldStartIndex, + int lastFieldEndIndex) { + List<String> newField = fields.subList(0, maxFieldSize - 1); + String last = text.substring(lastFieldStartIndex, lastFieldEndIndex); + newField.add(last); + return newField; + } + /** * Concat the given fields. 
* diff --git a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java index 714652664e..fc64811a97 100644 --- a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java +++ b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java @@ -112,4 +112,45 @@ public class StringUtilsTest { assertEquals("home", csv1Array2[2][1]); assertEquals("home", csv1Array2[2][2]); } + + @Test + public void testSplitCsvStringWithMaxFields() { + + String csvString = "name|age=20\\||&'\n\name|age=20\\||&'\n\n|home|\\home\\"; + String[][] csv1Array0 = StringUtils.splitCsv(csvString, '|', + '\\', '\'', '\n', false, 0); + assertEquals(0, csv1Array0.length); + + String[][] csv1Array1 = StringUtils.splitCsv(csvString, '|', + '\\', '\'', '\n', false, 1); + assertEquals("name|age=20\\||&'\n\name|age=20\\||&'", csv1Array1[0][0]); + assertEquals("", csv1Array1[1][0]); + assertEquals("|home|\\home\\", csv1Array1[2][0]); + + String[][] csv1Array2 = StringUtils.splitCsv(csvString, '|', + '\\', '\'', '\n', false, 2); + assertEquals("name", csv1Array2[0][0]); + assertEquals("age=20\\||&'\n\name|age=20\\||&'", csv1Array2[0][1]); + assertEquals("", csv1Array2[1][0]); + assertEquals("", csv1Array2[2][0]); + assertEquals("home|\\home\\", csv1Array2[2][1]); + + String[][] csv1Array3 = StringUtils.splitCsv(csvString, '|', + '\\', '\'', '\n', false, 3); + assertEquals("name", csv1Array3[0][0]); + assertEquals("age=20|", csv1Array3[0][1]); + assertEquals("&\n\name|age=20\\||&", csv1Array3[0][2]); + assertEquals("", csv1Array3[2][0]); + assertEquals("home", csv1Array3[2][1]); + assertEquals("home", csv1Array3[2][2]); + + String[][] csv1Array4 = StringUtils.splitCsv(csvString, '|', + '\\', '\'', '\n', false, 4); + assertEquals("name", 
csv1Array4[0][0]); + assertEquals("age=20|", csv1Array4[0][1]); + assertEquals("&\n\name|age=20\\||&", csv1Array4[0][2]); + assertEquals("", csv1Array4[2][0]); + assertEquals("home", csv1Array4[2][1]); + assertEquals("home", csv1Array4[2][2]); + } }