zhipeng93 commented on a change in pull request #28: URL: https://github.com/apache/flink-ml/pull/28#discussion_r759012504
########## File path: flink-ml-lib/src/main/java/org/apache/flink/ml/classification/linear/LogisticRegression.java ########## @@ -0,0 +1,653 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.classification.linear; + +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeutils.base.DoubleComparator; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.api.java.typeutils.TupleTypeInfo; +import org.apache.flink.iteration.DataStreamList; +import org.apache.flink.iteration.IterationBody; +import org.apache.flink.iteration.IterationBodyResult; +import org.apache.flink.iteration.IterationConfig; +import org.apache.flink.iteration.IterationConfig.OperatorLifeCycle; +import org.apache.flink.iteration.IterationListener; +import org.apache.flink.iteration.Iterations; +import 
org.apache.flink.iteration.ReplayableDataStreamList; +import org.apache.flink.iteration.operator.OperatorStateUtils; +import org.apache.flink.ml.api.Estimator; +import org.apache.flink.ml.common.broadcast.BroadcastUtils; +import org.apache.flink.ml.common.datastream.DataStreamUtils; +import org.apache.flink.ml.common.iteration.TerminateOnMaxIterOrTol; +import org.apache.flink.ml.linalg.BLAS; +import org.apache.flink.ml.param.Param; +import org.apache.flink.ml.util.ParamUtils; +import org.apache.flink.ml.util.ReadWriteUtils; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.AbstractUdfStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedMultiInput; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.TwoInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.internal.TableImpl; +import org.apache.flink.util.Collector; +import org.apache.flink.util.OutputTag; +import org.apache.flink.util.Preconditions; + +import org.apache.commons.collections.IteratorUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +/** + * This class implements methods to train a 
logistic regression model. For details, see + * https://en.wikipedia.org/wiki/Logistic_regression. + */ +public class LogisticRegression + implements Estimator<LogisticRegression, LogisticRegressionModel>, + LogisticRegressionParams<LogisticRegression> { + + private Map<Param<?>, Object> paramMap = new HashMap<>(); + + private static final OutputTag<Tuple2<double[], double[]>> MODEL_OUTPUT = + new OutputTag<Tuple2<double[], double[]>>("MODEL_OUTPUT") {}; + + public LogisticRegression() { + ParamUtils.initializeMapWithDefaultValues(this.paramMap, this); + } + + @Override + public Map<Param<?>, Object> getParamMap() { + return paramMap; + } + + @Override + public void save(String path) throws IOException { + ReadWriteUtils.saveMetadata(this, path); + } + + public static LogisticRegression load(StreamExecutionEnvironment env, String path) + throws IOException { + return ReadWriteUtils.loadStageParam(path); + } + + @Override + @SuppressWarnings("unchecked") + public LogisticRegressionModel fit(Table... inputs) { + Preconditions.checkArgument(inputs.length == 1); + StreamTableEnvironment tEnv = + (StreamTableEnvironment) ((TableImpl) inputs[0]).getTableEnvironment(); + + DataStream<Tuple3<Double, Double, double[]>> trainData = + tEnv.toDataStream(inputs[0]) + .map( + dataPoint -> + Tuple3.of( + getWeightCol() == null + ? 
new Double(1.0) + : (Double) + dataPoint.getField(getWeightCol()), + (Double) dataPoint.getField(getLabelCol()), + (double[]) dataPoint.getField(getFeaturesCol()))) + .returns( + new TupleTypeInfo<>( + BasicTypeInfo.DOUBLE_TYPE_INFO, + BasicTypeInfo.DOUBLE_TYPE_INFO, + PrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO)); + + DataStream<Double> distinctLabelValues = + DataStreamUtils.sortPartition( + DataStreamUtils.distinct(trainData.map(dataPoint -> dataPoint.f1)), + new DoubleComparator(true)); + final String broadcastLabelsName = "broadcastLabels"; + trainData = + BroadcastUtils.withBroadcastStream( + Collections.singletonList(trainData), + Collections.singletonMap(broadcastLabelsName, distinctLabelValues), + inputList -> { + DataStream data = inputList.get(0); + return data.transform( + "preProcess", + new TupleTypeInfo<>( + BasicTypeInfo.DOUBLE_TYPE_INFO, + BasicTypeInfo.DOUBLE_TYPE_INFO, + PrimitiveArrayTypeInfo + .DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO), + new PreprocessDataOp( + new PreprocessOneRecord(broadcastLabelsName))); + }); + + DataStream<double[]> initModel = + trainData + .transform( + "genInitModel", + PrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO, + new GenInitModel()) + .returns(PrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO); + + DataStream<Tuple2<double[], double[]>> modelAndLoss = train(trainData, initModel); + + DataStream<LogisticRegressionModelData> modelData = + modelAndLoss + .connect(distinctLabelValues) + .transform( + "composeModelData", + TypeInformation.of(LogisticRegressionModelData.class), + new ComposeModelDataOp()) + .setParallelism(1); + + LogisticRegressionModel model = + new LogisticRegressionModel().setModelData(tEnv.fromDataStream(modelData)); + ReadWriteUtils.updateExistingParams(model, paramMap); + return model; + } + + /** Pre-processes the training data. 
*/ + private static class PreprocessDataOp + extends AbstractUdfStreamOperator< + Tuple3<Double, Double, double[]>, + RichMapFunction< + Tuple3<Double, Double, double[]>, Tuple3<Double, Double, double[]>>> + implements OneInputStreamOperator< + Tuple3<Double, Double, double[]>, Tuple3<Double, Double, double[]>> { + public PreprocessDataOp( + RichMapFunction<Tuple3<Double, Double, double[]>, Tuple3<Double, Double, double[]>> + userFunction) { + super(userFunction); + } + + @Override + public void processElement(StreamRecord<Tuple3<Double, Double, double[]>> streamRecord) + throws Exception { + streamRecord.replace(userFunction.map(streamRecord.getValue())); + output.collect(streamRecord); + } + } + + /** Pre-processes one training sample. */ + private static class PreprocessOneRecord + extends RichMapFunction< + Tuple3<Double, Double, double[]>, Tuple3<Double, Double, double[]>> { + + String broadcastLabelsName; + + double[] distinctLabelValues; + + public PreprocessOneRecord(String broadcastLabelsName) { + this.broadcastLabelsName = broadcastLabelsName; + } + + @Override + public Tuple3<Double, Double, double[]> map(Tuple3<Double, Double, double[]> value) { + if (distinctLabelValues == null) { + List<Double> labelList = + getRuntimeContext().getBroadcastVariable(broadcastLabelsName); + distinctLabelValues = labelList.stream().mapToDouble(Double::doubleValue).toArray(); + } + // label mapping + value.f1 = Math.abs(value.f1 - distinctLabelValues[0]) < 1e-7 ? 1. : -1.; + return value; + } + } + + /** + * Generates initialized model. Note that the parallelism of model is same as the input train + * data, not one. 
+ */ + private static class GenInitModel extends AbstractStreamOperator<double[]> + implements OneInputStreamOperator<Tuple3<Double, Double, double[]>, double[]>, + BoundedOneInput { + + private int dim = 0; + + private ListState<Integer> dimState; + + @Override + public void endInput() { + output.collect(new StreamRecord<>(new double[dim])); + } + + @Override + public void processElement(StreamRecord<Tuple3<Double, Double, double[]>> streamRecord) { + dim = Math.max(dim, streamRecord.getValue().f2.length); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + dimState = + context.getOperatorStateStore() + .getListState( + new ListStateDescriptor<>( + "dimState", BasicTypeInfo.INT_TYPE_INFO)); + dim = OperatorStateUtils.getUniqueElement(dimState, "dimState").orElse(0); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + dimState.clear(); + dimState.add(dim); + } + } + + /** + * Does machine learning training on the input data with the initialized model, return the + * trained model and losses. + * + * @param trainData The training data. + * @param initModel The initialized model. + * @return The trained model and losses during the training. + */ + private DataStream<Tuple2<double[], double[]>> train( + DataStream<Tuple3<Double, Double, double[]>> trainData, + DataStream<double[]> initModel) { + LogisticGradient logisticGradient = new LogisticGradient(getReg()); + DataStreamList resultList = + Iterations.iterateBoundedStreamsUntilTermination( + DataStreamList.of(initModel), + ReplayableDataStreamList.notReplay(trainData), + IterationConfig.newBuilder() + .setOperatorLifeCycle(OperatorLifeCycle.ALL_ROUND) + .build(), + new TrainIterationBody( + logisticGradient, + getGlobalBatchSize(), + getLearningRate(), + getMaxIter(), + getTol())); + return resultList.get(0); + } + + /** The iteration implementation for training process. 
*/ + private static class TrainIterationBody implements IterationBody { + + private final LogisticGradient logisticGradient; + + private final int globalBatchSize; + + private final double learningRate; + + private final int maxIter; + + private final double tol; + + public TrainIterationBody( + LogisticGradient logisticGradient, + int globalBatchSize, + double learningRate, + int maxIter, + double tol) { + this.logisticGradient = logisticGradient; + this.globalBatchSize = globalBatchSize; + this.learningRate = learningRate; + this.maxIter = maxIter; + this.tol = tol; + } + + @Override + public IterationBodyResult process( + DataStreamList variableStreams, DataStreamList dataStreams) { + DataStream<double[]> initModelOrGradientsAndLoss = variableStreams.get(0); Review comment: > From the IterationBody's performance, it won't know whether this is initial value or the feedback value. On the first iteration, we get the initial value. Later, we get the feedback value. We can therefore tell which one is the initial value. > I believe we in general expect the content in the initial variable stream to have exactly the same semantic meaning as the feedback variable stream. It seems a bit counter-intuitive to say the initial value is Model and the feedback value is GradientAndLoss In expectation, yes, the initial variable should have exactly the same meaning as the feedback variable. However, a common use case for iterations in machine learning is that we have an initialized model and then communicate with each other by sending gradients. If we want to implement this pattern in the new iteration, we have to allow the initial variable to have different semantics from the feedback variable. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org