[ https://issues.apache.org/jira/browse/FLINK-3780?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15291281#comment-15291281 ]
ASF GitHub Bot commented on FLINK-3780: --------------------------------------- Github user greghogan commented on a diff in the pull request: https://github.com/apache/flink/pull/1980#discussion_r63898835 --- Diff: flink-libraries/flink-gelly/src/test/java/org/apache/flink/graph/library/similarity/JaccardIndexTest.java --- @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.graph.library.similarity; + +import org.apache.commons.math3.random.JDKRandomGenerator; +import org.apache.flink.api.java.DataSet; +import org.apache.flink.api.java.Utils.ChecksumHashCode; +import org.apache.flink.api.java.utils.DataSetUtils; +import org.apache.flink.graph.Graph; +import org.apache.flink.graph.asm.AsmTestBase; +import org.apache.flink.graph.generator.RMatGraph; +import org.apache.flink.graph.generator.random.JDKRandomGeneratorFactory; +import org.apache.flink.graph.generator.random.RandomGenerableFactory; +import org.apache.flink.graph.library.similarity.JaccardIndex.Result; +import org.apache.flink.test.util.TestBaseUtils; +import org.apache.flink.types.IntValue; +import org.apache.flink.types.LongValue; +import org.apache.flink.types.NullValue; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class JaccardIndexTest +extends AsmTestBase { + + @Test + public void testSimpleGraph() + throws Exception { + DataSet<Result<IntValue>> ji = undirectedSimpleGraph + .run(new JaccardIndex<IntValue, NullValue, NullValue>()); + + String expectedResult = + "(0,1,(1,4))\n" + + "(0,2,(1,4))\n" + + "(0,3,(2,4))\n" + + "(1,2,(2,4))\n" + + "(1,3,(1,6))\n" + + "(1,4,(1,3))\n" + + "(1,5,(1,3))\n" + + "(2,3,(1,6))\n" + + "(2,4,(1,3))\n" + + "(2,5,(1,3))\n" + + "(4,5,(1,1))\n"; + + TestBaseUtils.compareResultAsText(ji.collect(), expectedResult); + } + + @Test + public void testSimpleGraphWithMinimumScore() + throws Exception { + DataSet<Result<IntValue>> ji = undirectedSimpleGraph + .run(new JaccardIndex<IntValue, NullValue, NullValue>() + .setMinimumScore(1, 2)); + + String expectedResult = + "(0,3,(2,4))\n" + + "(1,2,(2,4))\n" + + "(4,5,(1,1))\n"; + + TestBaseUtils.compareResultAsText(ji.collect(), expectedResult); + } + + @Test + public void testSimpleGraphWithMaximumScore() + throws Exception { + DataSet<Result<IntValue>> ji = undirectedSimpleGraph + .run(new JaccardIndex<IntValue, NullValue, NullValue>() + .setMaximumScore(1, 2)); + + String expectedResult = + "(0,1,(1,4))\n" + + "(0,2,(1,4))\n" + + "(1,3,(1,6))\n" + + "(1,4,(1,3))\n" + + "(1,5,(1,3))\n" + + "(2,3,(1,6))\n" + + "(2,4,(1,3))\n" + + "(2,5,(1,3))\n"; + + TestBaseUtils.compareResultAsText(ji.collect(), expectedResult); + } + + @Test + public void testCompleteGraph() + throws Exception { + DataSet<Result<LongValue>> ji = completeGraph + .run(new JaccardIndex<LongValue, NullValue, NullValue>() + .setGroupSize(4)); + + for (Result<LongValue> result : ji.collect()) { + // the intersection includes every vertex + assertEquals(completeGraphVertexCount, result.getDistinctNeighborCount().getValue()); + + // the union only excludes the two vertices from the similarity score + assertEquals(completeGraphVertexCount - 2, result.getCommonNeighborCount().getValue()); + } + } + + @Test + public void testRMatGraph() + throws Exception { + long vertexCount = 1 << 8; + long edgeCount = 8 * vertexCount; + + RandomGenerableFactory<JDKRandomGenerator> rnd = new JDKRandomGeneratorFactory(); + + Graph<LongValue, NullValue, NullValue> graph = new RMatGraph<>(env, rnd, vertexCount, edgeCount) + .setSimpleGraph(true, false) + .generate(); + + DataSet<Result<LongValue>> ji = graph + .run(new JaccardIndex<LongValue, NullValue, NullValue>() + .setGroupSize(4)); + + ChecksumHashCode checksum = DataSetUtils.checksumHashCode(ji); + + assertEquals(13954, checksum.getCount()); + assertEquals(0x0000179f83a2a873L, checksum.getChecksum()); --- End diff -- By running the algorithm. > Jaccard Similarity > ------------------ > > Key: FLINK-3780 > URL: https://issues.apache.org/jira/browse/FLINK-3780 > Project: Flink > Issue Type: New Feature > Components: Gelly > Affects Versions: 1.1.0 > Reporter: Greg Hogan > Assignee: Greg Hogan > Fix For: 1.1.0 > > > Implement a Jaccard Similarity algorithm computing all non-zero similarity > scores. This algorithm is similar to {{TriangleListing}} but instead of > joining two-paths against an edge list we count two-paths. > {{flink-gelly-examples}} currently has {{JaccardSimilarityMeasure}} which > relies on {{Graph.getTriplets()}} so only computes similarity scores for > neighbors but not neighbors-of-neighbors. > This algorithm is easily modified for other similarity scores such as > Adamic-Adar similarity where the sum of endpoint degrees is replaced by the > degree of the middle vertex. -- This message was sent by Atlassian JIRA (v6.3.4#6332)