exceptionfactory commented on code in PR #9290: URL: https://github.com/apache/nifi/pull/9290#discussion_r1796211969
########## nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelReader.java: ########## @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nifi.excel; + +import org.apache.commons.io.IOUtils; +import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.controller.ConfigurationContext; +import org.apache.nifi.logging.ComponentLog; +import org.apache.nifi.schema.access.SchemaAccessUtils; +import org.apache.nifi.serialization.DateTimeUtils; +import org.apache.nifi.serialization.RecordReader; +import org.apache.nifi.serialization.record.DataType; +import org.apache.nifi.serialization.record.Record; +import org.apache.nifi.serialization.record.RecordFieldType; +import org.apache.nifi.serialization.record.RecordSchema; +import org.apache.nifi.util.MockConfigurationContext; +import org.apache.nifi.util.MockControllerServiceInitializationContext; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.sql.Date; +import java.sql.Time; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +@ExtendWith(MockitoExtension.class) +public class TestExcelReader { + + private static final ByteArrayOutputStream ALL_TYPES_CONTENTS = new ByteArrayOutputStream(); + private static final Map<String, Class<?>> FIELD_NAMES_AND_CLASS_TYPES = new LinkedHashMap<>(); + + @Mock + private ComponentLog logger; + + private ExcelReader excelReader; + private Map<PropertyDescriptor, String> properties; + + @BeforeAll + static void setUpBeforeAll() throws Exception { + InputStream inputStream = Files.newInputStream(Paths.get("src/test/resources/excel/all_types.xlsx")); + IOUtils.copyLarge(inputStream, ALL_TYPES_CONTENTS); + FIELD_NAMES_AND_CLASS_TYPES.put("Date_Custom", Date.class); + FIELD_NAMES_AND_CLASS_TYPES.put("Time_Custom", Time.class); + FIELD_NAMES_AND_CLASS_TYPES.put("Timestamp", Timestamp.class); + FIELD_NAMES_AND_CLASS_TYPES.put("Boolean_Function", Boolean.class); + FIELD_NAMES_AND_CLASS_TYPES.put("Number_Function", Integer.class); + FIELD_NAMES_AND_CLASS_TYPES.put("Blank_Cell", String.class); + FIELD_NAMES_AND_CLASS_TYPES.put("String", String.class); + FIELD_NAMES_AND_CLASS_TYPES.put("Numeric", Float.class); + FIELD_NAMES_AND_CLASS_TYPES.put("Variable_Missing", String.class); + FIELD_NAMES_AND_CLASS_TYPES.put("Syntax_Error", String.class); + } + + @BeforeEach + void setUp() { + excelReader = new ExcelReader(); + properties = new HashMap<>(); + new ExcelReader().getSupportedPropertyDescriptors().forEach(prop -> properties.put(prop, prop.getDefaultValue())); + } + + @Test + void testReadWithInferredSchema() throws Exception { + properties.put(DateTimeUtils.DATE_FORMAT, "M/dd/yy"); + properties.put(DateTimeUtils.TIME_FORMAT, "HH:mm"); + properties.put(DateTimeUtils.TIMESTAMP_FORMAT, "M/dd/yyyy HH:mm:ss"); Review Comment: It would be helpful to declare static values for the date, time, and timestamp formats, which can be reused across this class. ########## nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelRecordReader.java: ########## @@ -141,15 +140,15 @@ public void testMultipleRecordsSingleSheet() throws MalformedRecordException { .build(); ExcelRecordReader recordReader = new ExcelRecordReader(configuration, getInputStream(DATA_FORMATTING_FILE), logger); - List<Record> records = getRecords(recordReader, false, false); + List<Record> records = getRecords(recordReader, false); assertEquals(9, records.size()); } private RecordSchema getDataFormattingSchema() { final List<RecordField> fields = Arrays.asList( new RecordField("Numbers", RecordFieldType.DOUBLE.getDataType()), - new RecordField("Timestamps", RecordFieldType.DATE.getDataType()), + new RecordField("Timestamps", RecordFieldType.STRING.getDataType()), Review Comment: Is this type change just to simplify the test, with the understanding that `TestExcelReader` covers the various field types? ########## nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelUtils.java: ########## @@ -16,15 +16,45 @@ */ package org.apache.nifi.excel; +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.CellType; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.usermodel.DateUtil; import org.apache.poi.ss.usermodel.Row; +import java.util.Locale; + public class ExcelUtils { static final String FIELD_NAME_PREFIX = "column_"; + private static final DataFormatter DATA_FORMATTER = new DataFormatter(Locale.getDefault()); + + static { + DATA_FORMATTER.setUseCachedValuesForFormulaCells(true); + } private ExcelUtils() { } public static boolean hasCells(final Row row) { return row != null && row.getFirstCellNum() != -1; } + + public static String getFormattedCellValue(Cell cell) { + if (cell != null) { + /* DataFormatter should be used in most cases unless an exception will be thrown or + string localization may occur which could result in later exceptions. A cell which has a formula cannot + format a cell which has an error value and an exception will be thrown. In addition, a cell whose type + is numeric when formatted will take the current locale into consideration which could prevent later parsing + as a float/double.*/ + if (cell.getCellType().equals(CellType.FORMULA) && cell.getCachedFormulaResultType().equals(CellType.ERROR)) { + return cell.getStringCellValue(); + } else if (cell.getCellType().equals(CellType.NUMERIC) && !DateUtil.isCellDateFormatted(cell)) { + final double numericCellValue = cell.getNumericCellValue(); + return numericCellValue % 1 == 0 ? Long.toString((long) numericCellValue) : Double.toString(numericCellValue); Review Comment: Is there any other more optimized way to determine whether the value is a `long` or a `double` with having to do the modulo operation? ########## nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelUtils.java: ########## @@ -16,15 +16,45 @@ */ package org.apache.nifi.excel; +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.CellType; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.usermodel.DateUtil; import org.apache.poi.ss.usermodel.Row; +import java.util.Locale; + public class ExcelUtils { static final String FIELD_NAME_PREFIX = "column_"; + private static final DataFormatter DATA_FORMATTER = new DataFormatter(Locale.getDefault()); + + static { + DATA_FORMATTER.setUseCachedValuesForFormulaCells(true); + } private ExcelUtils() { } public static boolean hasCells(final Row row) { return row != null && row.getFirstCellNum() != -1; } + + public static String getFormattedCellValue(Cell cell) { + if (cell != null) { + /* DataFormatter should be used in most cases unless an exception will be thrown or + string localization may occur which could result in later exceptions. A cell which has a formula cannot + format a cell which has an error value and an exception will be thrown. In addition, a cell whose type + is numeric when formatted will take the current locale into consideration which could prevent later parsing + as a float/double.*/ + if (cell.getCellType().equals(CellType.FORMULA) && cell.getCachedFormulaResultType().equals(CellType.ERROR)) { + return cell.getStringCellValue(); + } else if (cell.getCellType().equals(CellType.NUMERIC) && !DateUtil.isCellDateFormatted(cell)) { Review Comment: It looks like it would be helpful to declare a variable for the `cellType`, making these conditionals a bit more readable than calling `getCellType()` in both blocks. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
