This is an automated email from the ASF dual-hosted git repository.

dockerzhang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/inlong.git


The following commit(s) were added to refs/heads/master by this push:
     new e50f45ba8c [INLONG-10768][Sort] Csv utils support specified the max split field size (#10769)
e50f45ba8c is described below

commit e50f45ba8c40f7d61ea0079657cc683f2619c8c3
Author: vernedeng <verned...@apache.org>
AuthorDate: Sun Aug 11 15:44:58 2024 +0800

    [INLONG-10768][Sort] Csv utils support specified the max split field size (#10769)
---
 .../inlong/sort/formats/util/StringUtils.java      | 69 +++++++++++++++++++++-
 .../sort/formats/common/StringUtilsTest.java       | 41 +++++++++++++
 2 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
index f33ad8e825..3ea6678ca1 100644
--- a/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
+++ b/inlong-sort/sort-formats/format-common/src/main/java/org/apache/inlong/sort/formats/util/StringUtils.java
@@ -387,6 +387,19 @@ public class StringUtils {
         return splitCsv(text, delimiter, escapeChar, quoteChar, lineDelimiter, 
false);
     }
 
+    /**
+     * @see StringUtils#splitCsv(String, Character, Character, Character, Character, boolean, Integer)
+     */
+    public static String[][] splitCsv(
+            @Nonnull String text,
+            @Nonnull Character delimiter,
+            @Nullable Character escapeChar,
+            @Nullable Character quoteChar,
+            @Nullable Character lineDelimiter,
+            boolean deleteHeadDelimiter) {
+        return splitCsv(text, delimiter, escapeChar, quoteChar, lineDelimiter, 
deleteHeadDelimiter, null);
+    }
+
     /**
      * Splits the csv text, which may contains multiple lines of data.
      *
@@ -402,6 +415,7 @@ public class StringUtils {
      * @param lineDelimiter The delimiter between lines, e.g. '\n'.
      * @param deleteHeadDelimiter If true and the leading character of a line
      *                            is a delimiter, it will be ignored.
+     * @param maxFieldSize The max field size of one single line
      * @return A 2-D String array representing the parsed data, where the 1st
      * dimension is row and the 2nd dimension is column.
      */
@@ -411,9 +425,16 @@ public class StringUtils {
             @Nullable Character escapeChar,
             @Nullable Character quoteChar,
             @Nullable Character lineDelimiter,
-            boolean deleteHeadDelimiter) {
+            boolean deleteHeadDelimiter,
+            @Nullable Integer maxFieldSize) {
+        if (maxFieldSize != null && maxFieldSize <= 0) {
+            return new String[0][];
+        }
+
         List<String[]> lines = new ArrayList<>();
         List<String> fields = new ArrayList<>();
+        int splittedSize = 0;
+        int lastFieldStartIndex = 0;
 
         StringBuilder stringBuilder = new StringBuilder();
         int state = STATE_NORMAL;
@@ -431,6 +452,14 @@ public class StringUtils {
                         String field = stringBuilder.toString();
                         fields.add(field);
                         stringBuilder.setLength(0);
+
+                        splittedSize++;
+                        // if the last field, mark the last field start index
+                        if (maxFieldSize != null && splittedSize == 
maxFieldSize - 1) {
+                            if (i + 1 < text.length()) {
+                                lastFieldStartIndex = i + 1;
+                            }
+                        }
                         break;
                     case STATE_ESCAPING:
                         stringBuilder.append(ch);
@@ -471,10 +500,19 @@ public class StringUtils {
                     case STATE_NORMAL:
                         String field = stringBuilder.toString();
                         fields.add(field);
-                        lines.add(fields.toArray(new String[0]));
 
+                        // if the max field size < the real field size,
+                        // remove the extra fields and copy the latest field from lastFieldStartIndex to current index
+                        if (maxFieldSize != null && fields.size() > 
maxFieldSize) {
+                            fields = replaceLastField(fields, maxFieldSize, 
text, lastFieldStartIndex, i);
+                        }
+                        // reset the lastFieldStartIndex for new line
+                        lastFieldStartIndex = i + 1;
+
+                        lines.add(fields.toArray(new String[0]));
                         stringBuilder.setLength(0);
                         fields.clear();
+                        splittedSize = 0;
                         break;
                     case STATE_ESCAPING:
                         stringBuilder.append(ch);
@@ -498,6 +536,11 @@ public class StringUtils {
             case STATE_QUOTING:
                 String field = stringBuilder.toString();
                 fields.add(field);
+
+                if (maxFieldSize != null && fields.size() > maxFieldSize) {
+                    fields = replaceLastField(fields, maxFieldSize, text, 
lastFieldStartIndex, text.length());
+                }
+
                 lines.add(fields.toArray(new String[0]));
 
                 String[][] result = new String[lines.size()][];
@@ -510,6 +553,28 @@ public class StringUtils {
         }
     }
 
+    /**
+     * if the max field size < the real field size,
+     * remove the extra fields and copy the latest field from lastFieldStartIndex to lastFieldEndIndex
+     *
+     * @param fields Target field list
+     * @param maxFieldSize Specified max fieldSize
+     * @param text Origin text
+     * @param lastFieldStartIndex Start index of last field
+     * @param lastFieldEndIndex End index of last field
+     */
+    private static List<String> replaceLastField(
+            List<String> fields,
+            int maxFieldSize,
+            String text,
+            int lastFieldStartIndex,
+            int lastFieldEndIndex) {
+        List<String> newField = fields.subList(0, maxFieldSize - 1);
+        String last = text.substring(lastFieldStartIndex, lastFieldEndIndex);
+        newField.add(last);
+        return newField;
+    }
+
     /**
      * Concat the given fields.
      *
diff --git a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
index 714652664e..fc64811a97 100644
--- a/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
+++ b/inlong-sort/sort-formats/format-common/src/test/java/org/apache/inlong/sort/formats/common/StringUtilsTest.java
@@ -112,4 +112,45 @@ public class StringUtilsTest {
         assertEquals("home", csv1Array2[2][1]);
         assertEquals("home", csv1Array2[2][2]);
     }
+
+    @Test
+    public void testSplitCsvStringWithMaxFields() {
+
+        String csvString = 
"name|age=20\\||&'\n\name|age=20\\||&'\n\n|home|\\home\\";
+        String[][] csv1Array0 = StringUtils.splitCsv(csvString, '|',
+                '\\', '\'', '\n', false, 0);
+        assertEquals(0, csv1Array0.length);
+
+        String[][] csv1Array1 = StringUtils.splitCsv(csvString, '|',
+                '\\', '\'', '\n', false, 1);
+        assertEquals("name|age=20\\||&'\n\name|age=20\\||&'", 
csv1Array1[0][0]);
+        assertEquals("", csv1Array1[1][0]);
+        assertEquals("|home|\\home\\", csv1Array1[2][0]);
+
+        String[][] csv1Array2 = StringUtils.splitCsv(csvString, '|',
+                '\\', '\'', '\n', false, 2);
+        assertEquals("name", csv1Array2[0][0]);
+        assertEquals("age=20\\||&'\n\name|age=20\\||&'", csv1Array2[0][1]);
+        assertEquals("", csv1Array2[1][0]);
+        assertEquals("", csv1Array2[2][0]);
+        assertEquals("home|\\home\\", csv1Array2[2][1]);
+
+        String[][] csv1Array3 = StringUtils.splitCsv(csvString, '|',
+                '\\', '\'', '\n', false, 3);
+        assertEquals("name", csv1Array3[0][0]);
+        assertEquals("age=20|", csv1Array3[0][1]);
+        assertEquals("&\n\name|age=20\\||&", csv1Array3[0][2]);
+        assertEquals("", csv1Array3[2][0]);
+        assertEquals("home", csv1Array3[2][1]);
+        assertEquals("home", csv1Array3[2][2]);
+
+        String[][] csv1Array4 = StringUtils.splitCsv(csvString, '|',
+                '\\', '\'', '\n', false, 4);
+        assertEquals("name", csv1Array4[0][0]);
+        assertEquals("age=20|", csv1Array4[0][1]);
+        assertEquals("&\n\name|age=20\\||&", csv1Array4[0][2]);
+        assertEquals("", csv1Array4[2][0]);
+        assertEquals("home", csv1Array4[2][1]);
+        assertEquals("home", csv1Array4[2][2]);
+    }
 }

Reply via email to