[ https://issues.apache.org/jira/browse/FLINK-3780?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15291227#comment-15291227 ]
ASF GitHub Bot commented on FLINK-3780: --------------------------------------- Github user vasia commented on a diff in the pull request: https://github.com/apache/flink/pull/1980#discussion_r63894239 --- Diff: flink-libraries/flink-gelly/src/main/java/org/apache/flink/graph/library/similarity/JaccardIndex.java --- @@ -0,0 +1,462 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.graph.library.similarity; + +import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.common.functions.FlatMapFunction; +import org.apache.flink.api.common.functions.GroupReduceFunction; +import org.apache.flink.api.common.operators.Order; +import org.apache.flink.api.java.DataSet; +import org.apache.flink.api.java.functions.FunctionAnnotation; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.api.java.tuple.Tuple4; +import org.apache.flink.graph.Edge; +import org.apache.flink.graph.Graph; +import org.apache.flink.graph.GraphAlgorithm; +import org.apache.flink.graph.asm.degree.annotate.undirected.EdgeTargetDegree; +import org.apache.flink.graph.library.similarity.JaccardIndex.Result; +import org.apache.flink.graph.utils.Murmur3_32; +import org.apache.flink.types.CopyableValue; +import org.apache.flink.types.IntValue; +import org.apache.flink.types.LongValue; +import org.apache.flink.util.Collector; +import org.apache.flink.util.Preconditions; + +import java.util.ArrayList; +import java.util.List; + +/** + * The Jaccard Index measures the similarity between vertex neighborhoods. + * Scores range from 0.0 (no common neighbors) to 1.0 (all neighbors are common). + * <br/> + * This implementation produces similarity scores for each pair of vertices + * in the graph with at least one common neighbor; equivalently, this is the + * set of all non-zero Jaccard Similarity coefficients. + * <br/> + * The input graph must be a simple, undirected graph containing no duplicate + * edges or self-loops. + * + * @param <K> graph ID type + * @param <VV> vertex value type + * @param <EV> edge value type + */ +public class JaccardIndex<K extends CopyableValue<K>, VV, EV> +implements GraphAlgorithm<K, VV, EV, DataSet<Result<K>>> { + + public static final int DEFAULT_GROUP_SIZE = 64; + + // Optional configuration + private int groupSize = DEFAULT_GROUP_SIZE; + + private boolean unboundedScores = true; + + private int minimumScoreNumerator = 0; + + private int minimumScoreDenominator = 1; + + private int maximumScoreNumerator = 1; + + private int maximumScoreDenominator = 0; + + private int littleParallelism = ExecutionConfig.PARALLELISM_UNKNOWN; + + /** + * Override the default group size for the quadratic expansion of neighbor + * pairs. Small groups generate more data whereas large groups distribute + * computation less evenly among tasks. + * + * @param groupSize the group size for the quadratic expansion of neighbor pairs + * @return this + */ + public JaccardIndex<K, VV, EV> setGroupSize(int groupSize) { + Preconditions.checkArgument(groupSize > 0, "Group size must be greater than zero"); + + this.groupSize = groupSize; + + return this; + } + + /** + * Filter out Jaccard Index scores less than the given minimum fraction. + * + * @param numerator numerator of the minimum score + * @param denominator denominator of the minimum score + * @return this + * @see #setMaximumScore(int, int) + */ + public JaccardIndex<K, VV, EV> setMinimumScore(int numerator, int denominator) { --- End diff -- This should be documented in the library usage docs too. > Jaccard Similarity > ------------------ > > Key: FLINK-3780 > URL: https://issues.apache.org/jira/browse/FLINK-3780 > Project: Flink > Issue Type: New Feature > Components: Gelly > Affects Versions: 1.1.0 > Reporter: Greg Hogan > Assignee: Greg Hogan > Fix For: 1.1.0 > > > Implement a Jaccard Similarity algorithm computing all non-zero similarity > scores. This algorithm is similar to {{TriangleListing}} but instead of > joining two-paths against an edge list we count two-paths. > {{flink-gelly-examples}} currently has {{JaccardSimilarityMeasure}} which > relies on {{Graph.getTriplets()}} so only computes similarity scores for > neighbors but not neighbors-of-neighbors. > This algorithm is easily modified for other similarity scores such as > Adamic-Adar similarity where the sum of endpoint degrees is replaced by the > degree of the middle vertex. -- This message was sent by Atlassian JIRA (v6.3.4#6332)