zhipeng93 commented on a change in pull request #28: URL: https://github.com/apache/flink-ml/pull/28#discussion_r758968020
########## File path: flink-ml-lib/src/main/java/org/apache/flink/ml/classification/linear/LogisticRegression.java ########## @@ -0,0 +1,653 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.ml.classification.linear; + +import org.apache.flink.api.common.functions.RichMapFunction; +import org.apache.flink.api.common.state.ListState; +import org.apache.flink.api.common.state.ListStateDescriptor; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeutils.base.DoubleComparator; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.api.java.typeutils.TupleTypeInfo; +import org.apache.flink.iteration.DataStreamList; +import org.apache.flink.iteration.IterationBody; +import org.apache.flink.iteration.IterationBodyResult; +import org.apache.flink.iteration.IterationConfig; +import org.apache.flink.iteration.IterationConfig.OperatorLifeCycle; +import org.apache.flink.iteration.IterationListener; +import org.apache.flink.iteration.Iterations; +import 
org.apache.flink.iteration.ReplayableDataStreamList; +import org.apache.flink.iteration.operator.OperatorStateUtils; +import org.apache.flink.ml.api.Estimator; +import org.apache.flink.ml.common.broadcast.BroadcastUtils; +import org.apache.flink.ml.common.datastream.DataStreamUtils; +import org.apache.flink.ml.common.iteration.TerminateOnMaxIterOrTol; +import org.apache.flink.ml.linalg.BLAS; +import org.apache.flink.ml.param.Param; +import org.apache.flink.ml.util.ParamUtils; +import org.apache.flink.ml.util.ReadWriteUtils; +import org.apache.flink.runtime.state.StateInitializationContext; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.AbstractUdfStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedMultiInput; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.TwoInputStreamOperator; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.table.api.internal.TableImpl; +import org.apache.flink.util.Collector; +import org.apache.flink.util.OutputTag; +import org.apache.flink.util.Preconditions; + +import org.apache.commons.collections.IteratorUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +/** + * This class implements methods to train a 
logistic regression model. For details, see + * https://en.wikipedia.org/wiki/Logistic_regression. + */ +public class LogisticRegression + implements Estimator<LogisticRegression, LogisticRegressionModel>, + LogisticRegressionParams<LogisticRegression> { + + private Map<Param<?>, Object> paramMap = new HashMap<>(); + + private static final OutputTag<Tuple2<double[], double[]>> MODEL_OUTPUT = + new OutputTag<Tuple2<double[], double[]>>("MODEL_OUTPUT") {}; + + public LogisticRegression() { + ParamUtils.initializeMapWithDefaultValues(this.paramMap, this); + } + + @Override + public Map<Param<?>, Object> getParamMap() { + return paramMap; + } + + @Override + public void save(String path) throws IOException { + ReadWriteUtils.saveMetadata(this, path); + } + + public static LogisticRegression load(StreamExecutionEnvironment env, String path) + throws IOException { + return ReadWriteUtils.loadStageParam(path); + } + + @Override + @SuppressWarnings("unchecked") + public LogisticRegressionModel fit(Table... inputs) { + Preconditions.checkArgument(inputs.length == 1); + StreamTableEnvironment tEnv = + (StreamTableEnvironment) ((TableImpl) inputs[0]).getTableEnvironment(); + + DataStream<Tuple3<Double, Double, double[]>> trainData = + tEnv.toDataStream(inputs[0]) + .map( + dataPoint -> + Tuple3.of( + getWeightCol() == null + ? 
new Double(1.0) + : (Double) + dataPoint.getField(getWeightCol()), + (Double) dataPoint.getField(getLabelCol()), + (double[]) dataPoint.getField(getFeaturesCol()))) + .returns( + new TupleTypeInfo<>( + BasicTypeInfo.DOUBLE_TYPE_INFO, + BasicTypeInfo.DOUBLE_TYPE_INFO, + PrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO)); + + DataStream<Double> distinctLabelValues = + DataStreamUtils.sortPartition( + DataStreamUtils.distinct(trainData.map(dataPoint -> dataPoint.f1)), + new DoubleComparator(true)); + final String broadcastLabelsName = "broadcastLabels"; + trainData = + BroadcastUtils.withBroadcastStream( + Collections.singletonList(trainData), + Collections.singletonMap(broadcastLabelsName, distinctLabelValues), + inputList -> { + DataStream data = inputList.get(0); + return data.transform( + "preProcess", + new TupleTypeInfo<>( + BasicTypeInfo.DOUBLE_TYPE_INFO, + BasicTypeInfo.DOUBLE_TYPE_INFO, + PrimitiveArrayTypeInfo + .DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO), + new PreprocessDataOp( + new PreprocessOneRecord(broadcastLabelsName))); + }); + + DataStream<double[]> initModel = + trainData + .transform( + "genInitModel", + PrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO, + new GenInitModel()) + .returns(PrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO); + + DataStream<Tuple2<double[], double[]>> modelAndLoss = train(trainData, initModel); + + DataStream<LogisticRegressionModelData> modelData = + modelAndLoss + .connect(distinctLabelValues) + .transform( + "composeModelData", + TypeInformation.of(LogisticRegressionModelData.class), + new ComposeModelDataOp()) + .setParallelism(1); + + LogisticRegressionModel model = + new LogisticRegressionModel().setModelData(tEnv.fromDataStream(modelData)); + ReadWriteUtils.updateExistingParams(model, paramMap); + return model; + } + + /** Pre-processes the training data. 
*/ + private static class PreprocessDataOp + extends AbstractUdfStreamOperator< + Tuple3<Double, Double, double[]>, + RichMapFunction< + Tuple3<Double, Double, double[]>, Tuple3<Double, Double, double[]>>> + implements OneInputStreamOperator< + Tuple3<Double, Double, double[]>, Tuple3<Double, Double, double[]>> { + public PreprocessDataOp( + RichMapFunction<Tuple3<Double, Double, double[]>, Tuple3<Double, Double, double[]>> + userFunction) { + super(userFunction); + } + + @Override + public void processElement(StreamRecord<Tuple3<Double, Double, double[]>> streamRecord) + throws Exception { + streamRecord.replace(userFunction.map(streamRecord.getValue())); + output.collect(streamRecord); + } + } + + /** Pre-processes one training sample. */ + private static class PreprocessOneRecord + extends RichMapFunction< + Tuple3<Double, Double, double[]>, Tuple3<Double, Double, double[]>> { + + String broadcastLabelsName; + + double[] distinctLabelValues; + + public PreprocessOneRecord(String broadcastLabelsName) { + this.broadcastLabelsName = broadcastLabelsName; + } + + @Override + public Tuple3<Double, Double, double[]> map(Tuple3<Double, Double, double[]> value) { + if (distinctLabelValues == null) { + List<Double> labelList = + getRuntimeContext().getBroadcastVariable(broadcastLabelsName); + distinctLabelValues = labelList.stream().mapToDouble(Double::doubleValue).toArray(); + } + // label mapping + value.f1 = Math.abs(value.f1 - distinctLabelValues[0]) < 1e-7 ? 1. : -1.; + return value; + } + } + + /** + * Generates initialized model. Note that the parallelism of model is same as the input train + * data, not one. 
+ */ + private static class GenInitModel extends AbstractStreamOperator<double[]> + implements OneInputStreamOperator<Tuple3<Double, Double, double[]>, double[]>, + BoundedOneInput { + + private int dim = 0; + + private ListState<Integer> dimState; + + @Override + public void endInput() { + output.collect(new StreamRecord<>(new double[dim])); + } + + @Override + public void processElement(StreamRecord<Tuple3<Double, Double, double[]>> streamRecord) { + dim = Math.max(dim, streamRecord.getValue().f2.length); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + dimState = + context.getOperatorStateStore() + .getListState( + new ListStateDescriptor<>( + "dimState", BasicTypeInfo.INT_TYPE_INFO)); + dim = OperatorStateUtils.getUniqueElement(dimState, "dimState").orElse(0); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + dimState.clear(); + dimState.add(dim); + } + } + + /** + * Does machine learning training on the input data with the initialized model, return the + * trained model and losses. + * + * @param trainData The training data. + * @param initModel The initialized model. + * @return The trained model and losses during the training. + */ + private DataStream<Tuple2<double[], double[]>> train( + DataStream<Tuple3<Double, Double, double[]>> trainData, + DataStream<double[]> initModel) { + LogisticGradient logisticGradient = new LogisticGradient(getReg()); + DataStreamList resultList = + Iterations.iterateBoundedStreamsUntilTermination( + DataStreamList.of(initModel), + ReplayableDataStreamList.notReplay(trainData), + IterationConfig.newBuilder() + .setOperatorLifeCycle(OperatorLifeCycle.ALL_ROUND) + .build(), + new TrainIterationBody( + logisticGradient, + getGlobalBatchSize(), + getLearningRate(), + getMaxIter(), + getTol())); + return resultList.get(0); + } + + /** The iteration implementation for training process. 
*/ + private static class TrainIterationBody implements IterationBody { + + private final LogisticGradient logisticGradient; + + private final int globalBatchSize; + + private final double learningRate; + + private final int maxIter; + + private final double tol; + + public TrainIterationBody( + LogisticGradient logisticGradient, + int globalBatchSize, + double learningRate, + int maxIter, + double tol) { + this.logisticGradient = logisticGradient; + this.globalBatchSize = globalBatchSize; + this.learningRate = learningRate; + this.maxIter = maxIter; + this.tol = tol; + } + + @Override + public IterationBodyResult process( + DataStreamList variableStreams, DataStreamList dataStreams) { + DataStream<double[]> initModelOrGradientsAndLoss = variableStreams.get(0); + DataStream<Tuple3<Double, Double, double[]>> trainData = dataStreams.get(0); + SingleOutputStreamOperator<double[]> gradientAndLoss = + trainData + .connect(initModelOrGradientsAndLoss) + .transform( + "updateModelAndComputeGradients", + PrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO, + new CacheDataAndUpdateModelAndComputeGradient( + logisticGradient, + globalBatchSize, + learningRate, + maxIter, + tol)); + + DataStreamList reducedGradientAndLoss = + IterationBody.forEachRound( + DataStreamList.of(gradientAndLoss), + input -> { + DataStream<double[]> feedback = + DataStreamUtils.allReduceSum(input.get(0)); + return DataStreamList.of(feedback); + }); + DataStream<Integer> terminationCriteria = + reducedGradientAndLoss + .get(0) + .map( + x -> { + double[] value = (double[]) x; + return value[value.length - 1] / value[value.length - 2]; + }) + .flatMap(new TerminateOnMaxIterOrTol(maxIter, tol)); + + return new IterationBodyResult( + DataStreamList.of(reducedGradientAndLoss.get(0)), + DataStreamList.of(gradientAndLoss.getSideOutput(MODEL_OUTPUT)), + terminationCriteria); + } + } + + /** + * A stream operator that caches the training data in the first iteration and updates model + * using gradients 
iteratively. The first input is the training data, and the second input is + * the initialized model or feedback of gradient and loss. + */ + private static class CacheDataAndUpdateModelAndComputeGradient + extends AbstractStreamOperator<double[]> + implements TwoInputStreamOperator<Tuple3<Double, Double, double[]>, double[], double[]>, + IterationListener<double[]> { + + double[] coefficient; + + double[] gradient; + + int dim; + + int globalBatchSize; + + int localBatchSize; + + double learningRate; + + int maxIteration; + + double tol; + + private final LogisticGradient logisticGradient; + + /** TODO: add a more efficient sampling method. */ + List<Tuple3<Double, Double, double[]>> cachedTrainData; + + List<Tuple3<Double, Double, double[]>> batchData; + + Random random = new Random(2021); + + /** The buffer for feedback record: {coefficient, weightSum, loss}. */ + double[] feedbackBuffer; + + ListState<Double> lossState; + + ListState<double[]> coefficientState; + + ListState<Tuple3<Double, Double, double[]>> trainDataState; + + ListState<double[]> feedbackBufferState; + + public CacheDataAndUpdateModelAndComputeGradient( + LogisticGradient logisticGradient, + int globalBatchSize, + double learningRate, + int maxIteration, + double tol) { + this.logisticGradient = logisticGradient; + this.globalBatchSize = globalBatchSize; + this.batchData = new ArrayList<>(globalBatchSize); + this.learningRate = learningRate; + this.maxIteration = maxIteration; + this.tol = tol; + } + + @Override + public void open() { + int numTasks = getRuntimeContext().getNumberOfParallelSubtasks(); + int taskId = getRuntimeContext().getIndexOfThisSubtask(); + localBatchSize = globalBatchSize / numTasks; + if (globalBatchSize % numTasks > taskId) { + localBatchSize++; + } + } + + private List<Tuple3<Double, Double, double[]>> prepareBatchData( + List<Tuple3<Double, Double, double[]>> cachedData, int batchSize) { + batchData.clear(); + for (int i = 0; i < batchSize; i++) { + 
batchData.add(cachedData.get(random.nextInt(cachedData.size()))); + } + return batchData; + } + + private void updateModel() throws Exception { + System.arraycopy(feedbackBuffer, 0, gradient, 0, gradient.length); + double weightSum = feedbackBuffer[dim]; + double loss = feedbackBuffer[dim + 1] / weightSum; + lossState.add(loss); + BLAS.axpy(-learningRate / weightSum, gradient, coefficient); + } + + @Override + public void onEpochWatermarkIncremented( + int epochWatermark, Context context, Collector<double[]> collector) { + // TODO: let this method throws exception. + if (epochWatermark == 0) { + // initialize model and allocate memory + coefficient = feedbackBuffer; + dim = coefficient.length; + feedbackBuffer = new double[dim + 2]; + gradient = new double[dim]; + } else { + try { + updateModel(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + Arrays.fill(gradient, 0); + try { + if (cachedTrainData == null) { + cachedTrainData = IteratorUtils.toList(trainDataState.get().iterator()); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + batchData = prepareBatchData(cachedTrainData, localBatchSize); + Tuple2<Double, Double> lossAndWeightSum = + logisticGradient.computeLoss(batchData, coefficient); + logisticGradient.computeGradient(batchData, coefficient, gradient); + System.arraycopy(gradient, 0, feedbackBuffer, 0, gradient.length); + feedbackBuffer[dim] = lossAndWeightSum.f1; + feedbackBuffer[dim + 1] = lossAndWeightSum.f0; + collector.collect(feedbackBuffer); + } + + @Override + @SuppressWarnings("unchecked") + public void onIterationTerminated(Context context, Collector collector) { + // TODO: let this method throws exception. + // Updates model using the feedback buffer. + // Note that the gradients are received but onEpochWatermarkIncremented() is not + // invoked when we met termination condition. 
+ trainDataState.clear(); + coefficientState.clear(); + try { + if (getRuntimeContext().getIndexOfThisSubtask() == 0) { + updateModel(); + double[] loss = + ((List<Double>) IteratorUtils.toList(lossState.get().iterator())) + .stream().mapToDouble(Double::doubleValue).toArray(); + context.output(MODEL_OUTPUT, Tuple2.of(this.coefficient, loss)); + } + lossState.clear(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void processElement1(StreamRecord<Tuple3<Double, Double, double[]>> streamRecord) + throws Exception { + trainDataState.add(streamRecord.getValue()); + } + + @Override + public void processElement2(StreamRecord<double[]> streamRecord) { + feedbackBuffer = streamRecord.getValue(); + } + + @Override + public void initializeState(StateInitializationContext context) throws Exception { + super.initializeState(context); + trainDataState = + context.getOperatorStateStore() + .getListState( + new ListStateDescriptor<>( + "trainDataState", + new TupleTypeInfo<>( + BasicTypeInfo.DOUBLE_TYPE_INFO, + BasicTypeInfo.DOUBLE_TYPE_INFO, + PrimitiveArrayTypeInfo + .DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO))); + + lossState = + context.getOperatorStateStore() + .getListState( + new ListStateDescriptor<>( + "lossState", BasicTypeInfo.DOUBLE_TYPE_INFO)); + coefficientState = + context.getOperatorStateStore() + .getListState( + new ListStateDescriptor<>( + "coefficientState", + PrimitiveArrayTypeInfo + .DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO)); + OperatorStateUtils.getUniqueElement(coefficientState, "coefficientState") + .ifPresent(x -> coefficient = x); + feedbackBufferState = + context.getOperatorStateStore() + .getListState( + new ListStateDescriptor<>( + "feedbackBufferState", + PrimitiveArrayTypeInfo + .DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO)); + OperatorStateUtils.getUniqueElement(feedbackBufferState, "feedbackBufferState") + .ifPresent(x -> feedbackBuffer = x); + // allocate memory for gradient, initialize dim. 
+ if (coefficient != null) { + dim = coefficient.length; + gradient = new double[dim]; + } + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + coefficientState.clear(); + if (coefficient != null) { + coefficientState.add(coefficient); + } + feedbackBufferState.clear(); + if (feedbackBuffer != null) { + feedbackBufferState.add(feedbackBuffer); + } + } + } + + /** Composes the final model data. */ + private static class ComposeModelDataOp + extends AbstractStreamOperator<LogisticRegressionModelData> + implements TwoInputStreamOperator< + Tuple2<double[], double[]>, Double, LogisticRegressionModelData>, + BoundedMultiInput { + + ListState<Tuple2<double[], double[]>> coefficientAndLossState; + + ListState<Integer> finishedInputState; + + ListState<Double> labelsState; + + @Override + @SuppressWarnings("unchecked") + public void endInput(int i) throws Exception { Review comment: In this function, I need to emit some elements at the end of input. Could we achieve this with `AbstractStreamOperator::close`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org