yunfengzhou-hub commented on a change in pull request #32:
URL: https://github.com/apache/flink-ml/pull/32#discussion_r753077392



##########
File path: flink-ml-lib/src/test/java/org/apache/flink/ml/classification/NaiveBayesTest.java
##########
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.classification;
+
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
+import org.apache.flink.api.common.restartstrategy.RestartStrategies;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.ml.classification.naivebayes.NaiveBayes;
+import org.apache.flink.ml.classification.naivebayes.NaiveBayesModel;
+import org.apache.flink.ml.classification.naivebayes.NaiveBayesModelData;
+import org.apache.flink.ml.linalg.DenseVector;
+import org.apache.flink.ml.linalg.Vectors;
+import org.apache.flink.ml.util.ReadWriteUtils;
+import org.apache.flink.streaming.api.environment.ExecutionCheckpointingOptions;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.DataTypes;
+import org.apache.flink.table.api.Schema;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.types.Row;
+
+import org.apache.commons.collections.IteratorUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.flink.table.api.Expressions.$;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+/** Tests {@link NaiveBayes} and {@link NaiveBayesModel}. */
+public class NaiveBayesTest {
+    private StreamExecutionEnvironment env;
+    private StreamTableEnvironment tEnv;
+    private Schema schema;
+    private Row[] trainData;
+    private Row[] predictData;
+    private Row[] expectedOutput;
+    private String featuresCol;
+    private String labelCol;
+    private String predictCol;
+    private String modelType;
+    private double smoothing;
+    private boolean isSaveLoad;
+    private String errorMessage;
+
+    @Before
+    public void before() {
+        Configuration config = new Configuration();
+        config.set(ExecutionCheckpointingOptions.ENABLE_CHECKPOINTS_AFTER_TASKS_FINISH, true);
+        env = StreamExecutionEnvironment.getExecutionEnvironment(config);
+        env.setParallelism(4);
+        env.enableCheckpointing(100);
+        env.setRestartStrategy(RestartStrategies.noRestart());
+        tEnv = StreamTableEnvironment.create(env);
+
+        schema =
+                Schema.newBuilder()
+                        .column("f0", DataTypes.DOUBLE())
+                        .column("f1", DataTypes.of(DenseVector.class))
+                        .column("f2", DataTypes.DOUBLE())
+                        .columnByMetadata("rowtime", "TIMESTAMP_LTZ(3)")
+                        .watermark("rowtime", "SOURCE_WATERMARK()")
+                        .build();
+
+        trainData =
+                new Row[] {
+                    Row.of(1., Vectors.dense(1, 1., 1., 1., 2.), 11.0),
+                    Row.of(1., Vectors.dense(1, 1., 0., 1., 2.), 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1., 3.), 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1.5, 2.), 11.0),
+                    Row.of(2., Vectors.dense(3, 1.5, 1., 0.5, 3.), 10.0),
+                    Row.of(1., Vectors.dense(1, 1., 1.5, 0., 1.), 10.0),
+                    Row.of(2., Vectors.dense(4, 1., 1., 0., 1.), 10.0)
+                };
+
+        predictData = trainData;
+
+        expectedOutput =
+                new Row[] {
+                    Row.of(1., Vectors.dense(1, 1., 1., 1., 2.), 11.0, 11.0),
+                    Row.of(1., Vectors.dense(1, 1., 0., 1., 2.), 11.0, 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1., 3.), 11.0, 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1.5, 2.), 11.0, 11.0),
+                    Row.of(2., Vectors.dense(3, 1.5, 1., 0.5, 3.), 10.0, 10.0),
+                    Row.of(1., Vectors.dense(1, 1., 1.5, 0., 1.), 10.0, 10.0),
+                    Row.of(2., Vectors.dense(4, 1., 1., 0., 1.), 10.0, 10.0)
+                };
+
+        featuresCol = "features";
+        labelCol = "label";
+        predictCol = "predict";
+        modelType = "multinomial";
+        smoothing = 1.0;
+        isSaveLoad = false;
+    }
+
+    @Test
+    public void testParam() {
+        NaiveBayes estimator = new NaiveBayes();
+
+        assertEquals("features", estimator.getFeaturesCol());
+        assertEquals("label", estimator.getLabelCol());
+        assertEquals("multinomial", estimator.getModelType());
+        assertEquals("prediction", estimator.getPredictionCol());
+        assertEquals(1.0, estimator.getSmoothing(), 1e-5);
+
+        estimator
+                .setFeaturesCol("test_feature")
+                .setLabelCol("test_label")
+                .setPredictionCol("test_prediction")
+                .setSmoothing(2.0);
+
+        assertEquals("test_feature", estimator.getFeaturesCol());
+        assertEquals("test_label", estimator.getLabelCol());
+        assertEquals("test_prediction", estimator.getPredictionCol());
+        assertEquals(2.0, estimator.getSmoothing(), 1e-5);
+
+        NaiveBayesModel model = new NaiveBayesModel();
+
+        assertEquals("features", model.getFeaturesCol());
+        assertEquals("multinomial", model.getModelType());
+        assertEquals("prediction", model.getPredictionCol());
+
+        model.setFeaturesCol("test_feature").setPredictionCol("test_prediction");
+
+        assertEquals("test_feature", model.getFeaturesCol());
+        assertEquals("test_prediction", model.getPredictionCol());
+    }
+
+    @Test
+    public void testNaiveBayes() throws Exception {
+        errorMessage = "normal test for Naive Bayes";
+        runAndCheck();
+    }
+
+    @Test(expected = Exception.class)
+    public void testPredictUnseenFeature() throws Exception {
+        errorMessage =
+                "Naive Bayes should throw exception if unseen feature values are met in prediction "
+                        + "and the model type is categorical.";
+        predictData =
+                new Row[] {
+                    Row.of(1., Vectors.dense(5, 1., 1., 1., 2.), 11.0),
+                    Row.of(1., Vectors.dense(5, 1., 0., 1., 2.), 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1., 3.), 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1.5, 2.), 11.0),
+                    Row.of(2., Vectors.dense(3, 1.5, 1., 0.5, 3.), 10.0),
+                    Row.of(1., Vectors.dense(1, 1., 1.5, 0., 1.), 10.0),
+                    Row.of(2., Vectors.dense(4, 1., 1., 0., 1.), 10.0)
+                };
+        runAndCheck();
+    }
+
+    @Test(expected = Exception.class)
+    public void testVectorWithDiffLen() throws Exception {
+        errorMessage =
+                "Naive Bayes should throw exception if lengths of feature vectors are not equal.";
+        trainData =
+                new Row[] {
+                    Row.of(1., Vectors.dense(1, 1., 1., 1.), 11.0),
+                    Row.of(1., Vectors.dense(1, 1., 0., 1., 2.), 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1., 3.), 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1.5, 2.), 11.0),
+                    Row.of(2., Vectors.dense(3, 1.5, 1., 0.5, 3.), 10.0),
+                    Row.of(1., Vectors.dense(1, 1., 1.5, 0., 1.), 10.0),
+                    Row.of(2., Vectors.dense(4, 1., 1., 0., 1.), 10.0)
+                };
+        runAndCheck();
+    }
+
+    @Test(expected = Exception.class)
+    public void testVectorWithDiffLen2() throws Exception {
+        errorMessage =
+                "Naive Bayes should throw exception if lengths of feature vectors are not equal.";
+        trainData =
+                new Row[] {
+                    Row.of(1., Vectors.dense(1, 1., 1., 1.), 11.0),
+                    Row.of(1., Vectors.dense(1, 1., 0., 1.), 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1.), 11.0),
+                    Row.of(1., Vectors.dense(2, 0., 1., 1.5), 11.0),
+                    Row.of(2., Vectors.dense(3, 1.5, 1., 0.5, 3.), 10.0),
+                    Row.of(1., Vectors.dense(1, 1., 1.5, 0., 1.), 10.0),
+                    Row.of(2., Vectors.dense(4, 1., 1., 0., 1.), 10.0)
+                };
+        runAndCheck();
+    }
+
+    @Test
+    public void testSaveLoad() throws Exception {
+        errorMessage = "Naive Bayes should be able to save Model to filesystem and load correctly.";
+        isSaveLoad = true;
+        runAndCheck();
+    }
+
+    @Test
+    public void testGetModelData() throws Exception {
+        errorMessage =
+                "Naive Bayes should be able to provide model data with getModelData() method.";
+        trainData =
+                new Row[] {
+                    Row.of(1., Vectors.dense(1, 1.), 11.0), Row.of(2., Vectors.dense(2, 1.), 11.0)
+                };
+        NaiveBayesModel model = getModel();
+        NaiveBayesModelData expected =
+                new NaiveBayesModelData(
+                        new HashMap[][] {
+                            new HashMap[] {
+                                new HashMap<Double, Double>() {
+                                    {
+                                        put(2.0, -0.6931471805599453);
+                                        put(1.0, -0.6931471805599453);
+                                    }
+                                },
+                                new HashMap<Double, Double>() {
+                                    {
+                                        put(1.0, 0.0);
+                                    }
+                                },
+                            },
+                        },
+                        new double[] {0.0},
+                        new double[] {11.0});
+        NaiveBayesModelData actual =
+                NaiveBayesModelData.toDataStream(tEnv, model.getModelData()[0])
+                        .executeAndCollect()
+                        .next();
+        assertArrayEquals(errorMessage, expected.labels, actual.labels, 1e-5);
+        assertArrayEquals(errorMessage, expected.piArray, actual.piArray, 1e-5);
+        assertArrayEquals(errorMessage, expected.theta[0], actual.theta[0]);
+    }
+
+    @Test
+    public void testSetModelData() throws Exception {
+        errorMessage =
+                "Naive Bayes should be able to set model data with setModelData() method.";
+
+        NaiveBayesModel modelA = getModel();
+
+        Table modelData = modelA.getModelData()[0];
+        NaiveBayesModel modelB = new NaiveBayesModel().setModelData(modelData);
+        ReadWriteUtils.updateExistingParams(modelB, modelA.getParamMap());
+
+        checkResult(modelB);
+    }
+
+    private void runAndCheck() throws Exception {
+        NaiveBayesModel model = getModel();
+        checkResult(model);
+    }
+
+    private NaiveBayesModel getModel() throws Exception {
+        Table trainTable =
+                tEnv.fromDataStream(
+                                env.fromElements(trainData)
+                                        .assignTimestampsAndWatermarks(
+                                                WatermarkStrategy.noWatermarks()),
+                                schema)
+                        .as("weight", "features", "label");
+
+        NaiveBayes estimator =
+                new NaiveBayes()
+                        .setSmoothing(smoothing)
+                        .setFeaturesCol(featuresCol)
+                        .setLabelCol(labelCol)
+                        .setPredictionCol(predictCol)
+                        .setModelType(modelType);
+
+        if (isSaveLoad) {
+            String tempDir = Files.createTempDirectory("").toString();
+            estimator.save(tempDir);
+            env.execute();
+
+            estimator = NaiveBayes.load(tempDir);
+        }
+
+        NaiveBayesModel model = estimator.fit(trainTable);
+
+        if (isSaveLoad) {
+            String tempDir = Files.createTempDirectory("").toString();
+            model.save(tempDir);
+            env.execute();
+
+            model = NaiveBayesModel.load(tempDir);
+        }
+
+        return model;
+    }
+
+    private void checkResult(NaiveBayesModel model) {
+        Table predictTable =
+                tEnv.fromDataStream(
+                                env.fromElements(predictData)
+                                        .assignTimestampsAndWatermarks(
+                                                WatermarkStrategy.noWatermarks()),
+                                schema)
+                        .as("weight", "features", "label");
+
+        Table output =
+                model.transform(predictTable)[0].select(
+                        $("weight"), $("features"), $("label"), $("predict"));
+
+        Object[] actualObjects = IteratorUtils.toArray(output.execute().collect());
+        Row[] actual = new Row[actualObjects.length];
+        for (int i = 0; i < actualObjects.length; i++) {
+            actual[i] = (Row) actualObjects[i];
+        }
+
+        assertEquals(errorMessage, getFrequencyMap(expectedOutput), getFrequencyMap(actual));
+    }
+
+    private static Map<Object, Integer> getFrequencyMap(Row[] rows) {
+        Map<Object, Integer> map = new HashMap<>();
+        for (Row row : rows) {
+            List<Object> list = toList(row);
+            map.put(list, map.getOrDefault(list, 0) + 1);
+        }
+        return map;
+    }
+
+    private static List<Object> toList(Row row) {

Review comment:
       Helper methods like this have been replaced by existing tools like `CollectionUtils` in `org.apache.commons`.
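
       For illustration, a minimal sketch of what such a replacement could look like in this test (not part of the reviewed change): `CollectionUtils.getCardinalityMap` from `org.apache.commons.collections` builds the same element-to-frequency map that the hand-rolled `getFrequencyMap`/`toList` pair computes. It assumes `Row` (and `DenseVector`) equality is well-defined for the rows built in this test, and the helper name `assertRowsEquivalent` is hypothetical.

       ```java
       import org.apache.commons.collections.CollectionUtils;

       import java.util.Arrays;

       // Hypothetical helper for checkResult(): compares the expected and actual rows
       // ignoring order by building element -> occurrence-count maps, which is the same
       // check that getFrequencyMap()/toList() perform today.
       private static void assertRowsEquivalent(String message, Row[] expected, Row[] actual) {
           assertEquals(
                   message,
                   CollectionUtils.getCardinalityMap(Arrays.asList(expected)),
                   CollectionUtils.getCardinalityMap(Arrays.asList(actual)));
       }
       ```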



