Hello Bruno, 2015-02-22 1:22 GMT+01:00 <ki...@apache.org>:
> Repository: commons-text > Updated Branches: > refs/heads/NEW-METRICS 0404dbf4b -> ff1959c84 > > > Initial implementation of the cosine distance for strings (not sequences) > > > Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo > Commit: > http://git-wip-us.apache.org/repos/asf/commons-text/commit/ff1959c8 > Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/ff1959c8 > Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/ff1959c8 > > Branch: refs/heads/NEW-METRICS > Commit: ff1959c84dce2eac6f2e8432623d2a2a270a5f32 > Parents: 0404dbf > Author: Bruno P. Kinoshita <ki...@apache.org> > Authored: Sat Feb 21 22:22:28 2015 -0200 > Committer: Bruno P. Kinoshita <ki...@apache.org> > Committed: Sat Feb 21 22:22:28 2015 -0200 > > ---------------------------------------------------------------------- > .../text/similarity/CosineSimilarity.java | 48 +++++++++++++++++- > .../text/similarity/CosineSimilarityTest.java | 51 ++++++++++++++++++++ > 2 files changed, 98 insertions(+), 1 deletion(-) > ---------------------------------------------------------------------- > > > > http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java > ---------------------------------------------------------------------- > diff --git > a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java > b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java > index 4589c2d..ca9d087 100644 > --- > a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java > +++ > b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java > @@ -16,6 +16,52 @@ > */ > package org.apache.commons.text.similarity; > > -public class CosineSimilarity { > +/** > + * <p>Measures the Cosine similarity of two CharSequences. 
It treats the > CharSequences as > + * two vectors of an inner product space and compares the angle between > them.</p> > + * > + * <p> > + * For further explanation about the Cosine Similarity, take a look at its > + * Wikipedia page at http://en.wikipedia.org/wiki/Cosine_similarity. > + * </p> > + * > + * @since 0.1 > + */ > +public class CosineSimilarity implements StringMetric<Double> { > + > + @Override > + public Double compare(CharSequence left, CharSequence right) { > + if (left == null || right == null) { > + throw new IllegalArgumentException("String parameters must > not be null"); > + } > + long dotProduct = dot(left, right); > + double d1 = 0.0d; > + for (int i = 0; i < left.length(); ++i) { > + d1 += Math.pow(((int) left.charAt(i)), 2); > + } > + double d2 = 0.0d; > + for (int i = 0; i < right.length(); ++i) { > + d2 += Math.pow(((int) right.charAt(i)), 2); > + } > + double cosineSimilarity = dotProduct / (double) (Math.sqrt(d1) * > Math.sqrt(d2)); > + return cosineSimilarity; > + } > + > + /** > + * Computes the dot product of two CharSequences. It ignores > remaining characters. It means > + * that if a string is longer than other, then a smaller part of it > will be used to compute > + * the dot product. 
> + * > + * @param left left string > + * @param right right string > + * @return the dot product > + */ > + protected long dot(CharSequence left, CharSequence right) { > + long dotProduct = 0; > + for (int i = 0; i < left.length() && i < right.length(); ++i) { > + dotProduct += (((int) left.charAt(i)) * ((int) > right.charAt(i))); > + } > + return dotProduct; > + } > > } > > > http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java > ---------------------------------------------------------------------- > diff --git > a/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java > b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java > new file mode 100644 > index 0000000..aa08057 > --- /dev/null > +++ > b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java > @@ -0,0 +1,51 @@ > +/* > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > +package org.apache.commons.text.similarity; > + > +import static org.junit.Assert.assertEquals; > + > +import java.math.BigDecimal; > +import java.math.RoundingMode; > + > +import org.junit.BeforeClass; > +import org.junit.Test; > + > +/** > + * Unit tests for {@link > org.apache.commons.text.similarity.CosineSimilarity}. > + */ > +public class CosineSimilarityTest { > + > + private static CosineSimilarity cosineSimilarity; > + > + @BeforeClass > + public static void setUp() { > + cosineSimilarity = new CosineSimilarity(); > + } > + > + @Test > + public void testCosineSimilarity() { > + assertEquals(Double.valueOf(0.62d), > roundValue(cosineSimilarity.compare("ABCDE", "AB"))); > + assertEquals(Double.valueOf(1.00d), > roundValue(cosineSimilarity.compare("AB", "AB"))); > I don't understand this test case. Why can't we check for the actual values? > + } > + > + // --- Utility methods > + > + private Double roundValue(Double value) { > + return (Double) new BigDecimal(value).setScale(2, > RoundingMode.HALF_UP).doubleValue(); > + } > + > +} > > -- http://people.apache.org/~britter/ http://www.systemoutprintln.de/ http://twitter.com/BenediktRitter http://github.com/britter