Repository: commons-text Updated Branches: refs/heads/master 71b59af90 -> c0e8c0210
SANDBOX-483 Incorporate String algorithms from Commons Lang Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/c0e8c021 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/c0e8c021 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/c0e8c021 Branch: refs/heads/master Commit: c0e8c021059d8300f90cf2123528a35d7c468b57 Parents: 71b59af Author: Bruno P. Kinoshita <brunodepau...@yahoo.com.br> Authored: Thu Nov 20 02:23:05 2014 -0200 Committer: Bruno P. Kinoshita <brunodepau...@yahoo.com.br> Committed: Thu Nov 20 02:23:05 2014 -0200 ---------------------------------------------------------------------- .../commons/text/similarity/FuzzyDistance.java | 130 +++++++ .../text/similarity/JaroWrinklerDistance.java | 373 +++++++++++++++++++ .../text/similarity/LevenshteinDistance.java | 258 +++++++++++++ .../commons/text/similarity/StringMetric.java | 45 +++ .../commons/text/similarity/package-info.java | 22 ++ .../text/similarity/TestFuzzyDistance.java | 75 ++++ .../similarity/TestJaroWrinklerDistance.java | 62 +++ .../similarity/TestLevenshteinDistance.java | 146 ++++++++ 8 files changed, 1111 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java new file mode 100644 index 0000000..8e9228a --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/FuzzyDistance.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Locale; + +/** + * <p> + * This string matching algorithm is similar to the algorithms of editors such + * as Sublime Text, TextMate, Atom and others. One point is given for every + * matched character. Subsequent matches yield two bonus points. A higher score + * indicates a higher similarity. + * </p> + * + * @since 1.0 + */ +public class FuzzyDistance implements StringMetric<Integer> { + + /** + * <p> + * Find the Fuzzy Distance which indicates the similarity score between two + * Strings. This method uses the default locale. + * </p> + * + * @param term a full term that should be matched against, must not be null + * @param query the query that will be matched against a term, must not be + * null + * @return result score + * @throws IllegalArgumentException if either String input {@code null} + */ + @Override + public Integer compare(CharSequence term, CharSequence query) { + return compare(term, query, Locale.getDefault()); + } + + /** + * <p> + * Find the Fuzzy Distance which indicates the similarity score between two + * Strings. + * </p> + * + * <pre> + * StringUtils.getFuzzyDistance(null, null, null) = IllegalArgumentException + * StringUtils.getFuzzyDistance("", "", Locale.ENGLISH) = 0 + * StringUtils.getFuzzyDistance("Workshop", "b", Locale.ENGLISH) = 0 + * StringUtils.getFuzzyDistance("Room", "o", Locale.ENGLISH) = 1 + * StringUtils.getFuzzyDistance("Workshop", "w", Locale.ENGLISH) = 1 + * StringUtils.getFuzzyDistance("Workshop", "ws", Locale.ENGLISH) = 2 + * StringUtils.getFuzzyDistance("Workshop", "wo", Locale.ENGLISH) = 4 + * StringUtils.getFuzzyDistance("Apache Software Foundation", "asf", Locale.ENGLISH) = 3 + * </pre> + * + * @param term a full term that should be matched against, must not be null + * @param query the query that will be matched against a term, must not be + * null + * @param locale This string matching logic is case insensitive. A locale is + * necessary to normalize both Strings to lower case. + * @return result score + * @throws IllegalArgumentException if either String input {@code null} or + * Locale input {@code null} + */ + public Integer compare(CharSequence term, CharSequence query, Locale locale) { + if (term == null || query == null) { + throw new IllegalArgumentException("Strings must not be null"); + } else if (locale == null) { + throw new IllegalArgumentException("Locale must not be null"); + } + + // fuzzy logic is case insensitive. We normalize the Strings to lower + // case right from the start. Turning characters to lower case + // via Character.toLowerCase(char) is unfortunately insufficient + // as it does not accept a locale. + final String termLowerCase = term.toString().toLowerCase(locale); + final String queryLowerCase = query.toString().toLowerCase(locale); + + // the resulting score + int score = 0; + + // the position in the term which will be scanned next for potential + // query character matches + int termIndex = 0; + + // index of the previously matched character in the term + int previousMatchingCharacterIndex = Integer.MIN_VALUE; + + for (int queryIndex = 0; queryIndex < queryLowerCase.length(); queryIndex++) { + final char queryChar = queryLowerCase.charAt(queryIndex); + + boolean termCharacterMatchFound = false; + for (; termIndex < termLowerCase.length() + && !termCharacterMatchFound; termIndex++) { + final char termChar = termLowerCase.charAt(termIndex); + + if (queryChar == termChar) { + // simple character matches result in one point + score++; + + // subsequent character matches further improve + // the score. + if (previousMatchingCharacterIndex + 1 == termIndex) { + score += 2; + } + + previousMatchingCharacterIndex = termIndex; + + // we can leave the nested loop. Every character in the + // query can match at most one character in the term. + termCharacterMatchFound = true; + } + } + } + + return score; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java new file mode 100644 index 0000000..2dcd51c --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/JaroWrinklerDistance.java @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * <p> + * The Jaro measure is the weighted sum of percentage of matched characters + * from each file and transposed characters. Winkler increased this measure + * for matching initial characters. + * </p> + * + * <p> + * This implementation is based on the Jaro Winkler similarity algorithm + * from <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"> + * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance</a>. + * </p> + * + * <p> + * This code has been adapted from Apache Commons Lang 3.3. + * </p> + * + * @since 1.0 + */ +public class JaroWrinklerDistance implements StringMetric<Double> { + + /** + * Represents a failed index search. + */ + public static final int INDEX_NOT_FOUND = -1; + + /** + * <p> + * Find the Jaro Winkler Distance which indicates the similarity score + * between two Strings. + * </p> + * + * <pre> + * StringUtils.getJaroWinklerDistance(null, null) = IllegalArgumentException + * StringUtils.getJaroWinklerDistance("","") = 0.0 + * StringUtils.getJaroWinklerDistance("","a") = 0.0 + * StringUtils.getJaroWinklerDistance("aaapppp", "") = 0.0 + * StringUtils.getJaroWinklerDistance("frog", "fog") = 0.93 + * StringUtils.getJaroWinklerDistance("fly", "ant") = 0.0 + * StringUtils.getJaroWinklerDistance("elephant", "hippo") = 0.44 + * StringUtils.getJaroWinklerDistance("hippo", "elephant") = 0.44 + * StringUtils.getJaroWinklerDistance("hippo", "zzzzzzzz") = 0.0 + * StringUtils.getJaroWinklerDistance("hello", "hallo") = 0.88 + * StringUtils.getJaroWinklerDistance("ABC Corporation", "ABC Corp") = 0.91 + * StringUtils.getJaroWinklerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.93 + * StringUtils.getJaroWinklerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.94 + * StringUtils.getJaroWinklerDistance("PENNSYLVANIA", "PENNCISYLVNIA") = 0.9 + * </pre> + * + * @param left the first String, must not be null + * @param right the second String, must not be null + * @return result distance + * @throws IllegalArgumentException if either String input {@code null} + */ + @Override + public Double compare(CharSequence left, CharSequence right) { + final double DEFAULT_SCALING_FACTOR = 0.1; + + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + final double jaro = score(left, right); + final int cl = commonPrefixLength(left, right); + final double matchScore = Math.round(jaro + (DEFAULT_SCALING_FACTOR + * cl * (1.0 - jaro)) * 100.0) / 100.0; + + return matchScore; + } + + // TODO: we can move these methods to a Util class, keep them here, + // create a common abstract class, shade lang-3.3... + + /** + * Calculates the number of characters from the beginning of the strings + * that match exactly one-to-one, up to a maximum of four (4) characters. + * + * @param first The first string. + * @param second The second string. + * @return A number between 0 and 4. + */ + private static int commonPrefixLength(final CharSequence first, + final CharSequence second) { + final int result = getCommonPrefix(first.toString(), second.toString()) + .length(); + + // Limit the result to 4. + return result > 4 ? 4 : result; + } + + /** + * <p> + * Compares all Strings in an array and returns the initial sequence of + * characters that is common to all of them. + * </p> + * + * <p> + * For example, + * <code>getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) -> "i am a "</code> + * </p> + * + * <pre> + * StringUtils.getCommonPrefix(null) = "" + * StringUtils.getCommonPrefix(new String[] {}) = "" + * StringUtils.getCommonPrefix(new String[] {"abc"}) = "abc" + * StringUtils.getCommonPrefix(new String[] {null, null}) = "" + * StringUtils.getCommonPrefix(new String[] {"", ""}) = "" + * StringUtils.getCommonPrefix(new String[] {"", null}) = "" + * StringUtils.getCommonPrefix(new String[] {"abc", null, null}) = "" + * StringUtils.getCommonPrefix(new String[] {null, null, "abc"}) = "" + * StringUtils.getCommonPrefix(new String[] {"", "abc"}) = "" + * StringUtils.getCommonPrefix(new String[] {"abc", ""}) = "" + * StringUtils.getCommonPrefix(new String[] {"abc", "abc"}) = "abc" + * StringUtils.getCommonPrefix(new String[] {"abc", "a"}) = "a" + * StringUtils.getCommonPrefix(new String[] {"ab", "abxyz"}) = "ab" + * StringUtils.getCommonPrefix(new String[] {"abcde", "abxyz"}) = "ab" + * StringUtils.getCommonPrefix(new String[] {"abcde", "xyz"}) = "" + * StringUtils.getCommonPrefix(new String[] {"xyz", "abcde"}) = "" + * StringUtils.getCommonPrefix(new String[] {"i am a machine", "i am a robot"}) = "i am a " + * </pre> + * + * @param strs array of String objects, entries may be null + * @return the initial sequence of characters that are common to all Strings + * in the array; empty String if the array is null, the elements are + * all null or if there is no common prefix. + * @since 2.4 + */ + public static String getCommonPrefix(final String... strs) { + if (strs == null || strs.length == 0) { + return ""; + } + final int smallestIndexOfDiff = indexOfDifference(strs); + if (smallestIndexOfDiff == INDEX_NOT_FOUND) { + // all strings were identical + if (strs[0] == null) { + return ""; + } + return strs[0]; + } else if (smallestIndexOfDiff == 0) { + // there were no common initial characters + return ""; + } else { + // we found a common initial character sequence + return strs[0].substring(0, smallestIndexOfDiff); + } + } + + /** + * This method returns the Jaro-Winkler score for string matching. + * + * @param first the first string to be matched + * @param second the second string to be machted + * @return matching score without scaling factor impact + */ + protected static double score(final CharSequence first, + final CharSequence second) { + String shorter; + String longer; + + // Determine which String is longer. + if (first.length() > second.length()) { + longer = first.toString().toLowerCase(); + shorter = second.toString().toLowerCase(); + } else { + longer = second.toString().toLowerCase(); + shorter = first.toString().toLowerCase(); + } + + // Calculate the half length() distance of the shorter String. + final int halflength = shorter.length() / 2 + 1; + + // Find the set of matching characters between the shorter and longer + // strings. Note that + // the set of matching characters may be different depending on the + // order of the strings. + final String m1 = getSetOfMatchingCharacterWithin(shorter, longer, + halflength); + final String m2 = getSetOfMatchingCharacterWithin(longer, shorter, + halflength); + + // If one or both of the sets of common characters is empty, then + // there is no similarity between the two strings. + if (m1.length() == 0 || m2.length() == 0) { + return 0.0; + } + + // If the set of common characters is not the same size, then + // there is no similarity between the two strings, either. + if (m1.length() != m2.length()) { + return 0.0; + } + + // Calculate the number of transposition between the two sets + // of common characters. + final int transpositions = transpositions(m1, m2); + + // Calculate the distance. + final double dist = (m1.length() / ((double) shorter.length()) + + m2.length() / ((double) longer.length()) + (m1.length() - transpositions) + / ((double) m1.length())) / 3.0; + return dist; + } + + /** + * Calculates the number of transposition between two strings. + * + * @param first The first string. + * @param second The second string. + * @return The number of transposition between the two strings. + */ + protected static int transpositions(final CharSequence first, + final CharSequence second) { + int transpositions = 0; + for (int i = 0; i < first.length(); i++) { + if (first.charAt(i) != second.charAt(i)) { + transpositions++; + } + } + return transpositions / 2; + } + + /** + * <p> + * Compares all CharSequences in an array and returns the index at which the + * CharSequences begin to differ. + * </p> + * + * <p> + * For example, + * <code>indexOfDifference(new String[] {"i am a machine", "i am a robot"}) -> 7</code> + * </p> + * + * <pre> + * StringUtils.indexOfDifference(null) = -1 + * StringUtils.indexOfDifference(new String[] {}) = -1 + * StringUtils.indexOfDifference(new String[] {"abc"}) = -1 + * StringUtils.indexOfDifference(new String[] {null, null}) = -1 + * StringUtils.indexOfDifference(new String[] {"", ""}) = -1 + * StringUtils.indexOfDifference(new String[] {"", null}) = 0 + * StringUtils.indexOfDifference(new String[] {"abc", null, null}) = 0 + * StringUtils.indexOfDifference(new String[] {null, null, "abc"}) = 0 + * StringUtils.indexOfDifference(new String[] {"", "abc"}) = 0 + * StringUtils.indexOfDifference(new String[] {"abc", ""}) = 0 + * StringUtils.indexOfDifference(new String[] {"abc", "abc"}) = -1 + * StringUtils.indexOfDifference(new String[] {"abc", "a"}) = 1 + * StringUtils.indexOfDifference(new String[] {"ab", "abxyz"}) = 2 + * StringUtils.indexOfDifference(new String[] {"abcde", "abxyz"}) = 2 + * StringUtils.indexOfDifference(new String[] {"abcde", "xyz"}) = 0 + * StringUtils.indexOfDifference(new String[] {"xyz", "abcde"}) = 0 + * StringUtils.indexOfDifference(new String[] {"i am a machine", "i am a robot"}) = 7 + * </pre> + * + * @param css array of CharSequences, entries may be null + * @return the index where the strings begin to differ; -1 if they are all + * equal + * @since 2.4 + * @since 3.0 Changed signature from indexOfDifference(String...) to + * indexOfDifference(CharSequence...) + */ + protected static int indexOfDifference(final CharSequence... css) { + if (css == null || css.length <= 1) { + return INDEX_NOT_FOUND; + } + boolean anyStringNull = false; + boolean allStringsNull = true; + final int arrayLen = css.length; + int shortestStrLen = Integer.MAX_VALUE; + int longestStrLen = 0; + + // find the min and max string lengths; this avoids checking to make + // sure we are not exceeding the length of the string each time through + // the bottom loop. + for (int i = 0; i < arrayLen; i++) { + if (css[i] == null) { + anyStringNull = true; + shortestStrLen = 0; + } else { + allStringsNull = false; + shortestStrLen = Math.min(css[i].length(), shortestStrLen); + longestStrLen = Math.max(css[i].length(), longestStrLen); + } + } + + // handle lists containing all nulls or all empty strings + if (allStringsNull || longestStrLen == 0 && !anyStringNull) { + return INDEX_NOT_FOUND; + } + + // handle lists containing some nulls or some empty strings + if (shortestStrLen == 0) { + return 0; + } + + // find the position with the first difference across all strings + int firstDiff = -1; + for (int stringPos = 0; stringPos < shortestStrLen; stringPos++) { + final char comparisonChar = css[0].charAt(stringPos); + for (int arrayPos = 1; arrayPos < arrayLen; arrayPos++) { + if (css[arrayPos].charAt(stringPos) != comparisonChar) { + firstDiff = stringPos; + break; + } + } + if (firstDiff != -1) { + break; + } + } + + if (firstDiff == -1 && shortestStrLen != longestStrLen) { + // we compared all of the characters up to the length of the + // shortest string and didn't find a match, but the string lengths + // vary, so return the length of the shortest string. + return shortestStrLen; + } + return firstDiff; + } + + /** + * Gets a set of matching characters between two strings. + * + * <p> + * Two characters from the first string and the second string are + * considered matching if the character's respective positions are no + * farther than the limit value. + * </p> + * + * @param first The first string. + * @param second The second string. + * @param limit The maximum distance to consider. + * @return A string contain the set of common characters. + */ + protected static String getSetOfMatchingCharacterWithin( + final CharSequence first, final CharSequence second, final int limit) { + final StringBuilder common = new StringBuilder(); + final StringBuilder copy = new StringBuilder(second); + + for (int i = 0; i < first.length(); i++) { + final char ch = first.charAt(i); + boolean found = false; + + // See if the character is within the limit positions away from the + // original position of that character. + for (int j = Math.max(0, i - limit); !found + && j < Math.min(i + limit, second.length()); j++) { + if (copy.charAt(j) == ch) { + found = true; + common.append(ch); + copy.setCharAt(j, '*'); + } + } + } + return common.toString(); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java new file mode 100644 index 0000000..1793f1e --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Arrays; + +/** + * <p> + * A string metric for measuring the difference between two sequences. + * </p> + * + * <p> + * This code has been adapted from Apache Commons Lang 3.3. + * </p> + * + * @since 1.0 + */ +public class LevenshteinDistance implements StringMetric<Integer> { + + /** + * <p> + * Find the Levenshtein distance between two Strings. + * </p> + * + * <p> + * This is the number of changes needed to change one String into another, + * where each change is a single character modification (deletion, insertion + * or substitution). + * </p> + * + * <p> + * The previous implementation of the Levenshtein distance algorithm was + * from <a + * href="http://www.merriampark.com/ld.htm">http://www.merriampark.com + * /ld.htm</a> + * </p> + * + * <p> + * Chas Emerick has written an implementation in Java, which avoids an + * OutOfMemoryError which can occur when my Java implementation is used with + * very large strings.<br> + * This implementation of the Levenshtein distance algorithm is from <a + * href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ + * ldjava.htm</a> + * </p> + * + * <pre> + * StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException + * StringUtils.getLevenshteinDistance(*, null) = IllegalArgumentException + * StringUtils.getLevenshteinDistance("","") = 0 + * StringUtils.getLevenshteinDistance("","a") = 1 + * StringUtils.getLevenshteinDistance("aaapppp", "") = 7 + * StringUtils.getLevenshteinDistance("frog", "fog") = 1 + * StringUtils.getLevenshteinDistance("fly", "ant") = 3 + * StringUtils.getLevenshteinDistance("elephant", "hippo") = 7 + * StringUtils.getLevenshteinDistance("hippo", "elephant") = 7 + * StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz") = 8 + * StringUtils.getLevenshteinDistance("hello", "hallo") = 1 + * </pre> + * + * @param left the first string, must not be null + * @param right the second string, must not be null + * @return result distance + * @throws IllegalArgumentException if either String input {@code null} + */ + @Override + public Integer compare(CharSequence left, CharSequence right) { + return compare(left, right, Integer.MAX_VALUE); + } + + /** + * <p> + * Find the Levenshtein distance between two Strings if it's less than or + * equal to a given threshold. + * </p> + * + * <p> + * This is the number of changes needed to change one String into another, + * where each change is a single character modification (deletion, insertion + * or substitution). + * </p> + * + * <p> + * This implementation follows from Algorithms on Strings, Trees and + * Sequences by Dan Gusfield and Chas Emerick's implementation of the + * Levenshtein distance algorithm from <a + * href="http://www.merriampark.com/ld.htm" + * >http://www.merriampark.com/ld.htm</a> + * </p> + * + * <pre> + * StringUtils.getLevenshteinDistance(null, *, *) = IllegalArgumentException + * StringUtils.getLevenshteinDistance(*, null, *) = IllegalArgumentException + * StringUtils.getLevenshteinDistance(*, *, -1) = IllegalArgumentException + * StringUtils.getLevenshteinDistance("","", 0) = 0 + * StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7 + * StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7 + * StringUtils.getLevenshteinDistance("aaapppp", "", 6)) = -1 + * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7 + * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1 + * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7 + * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1 + * </pre> + * + * @param left the first string, must not be null + * @param right the second string, must not be null + * @param threshold the target threshold, must not be negative + * @return result distance + * @throws IllegalArgumentException if either String input {@code null} or + * negative threshold + */ + public Integer compare(CharSequence left, CharSequence right, int threshold) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + if (threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + + /* + * This implementation only computes the distance if it's less than or + * equal to the threshold value, returning -1 if it's greater. The + * advantage is performance: unbounded distance is O(nm), but a bound of + * k allows us to reduce it to O(km) time by only computing a diagonal + * stripe of width 2k + 1 of the cost table. It is also possible to use + * this to compute the unbounded Levenshtein distance by starting the + * threshold at 1 and doubling each time until the distance is found; + * this is O(dm), where d is the distance. + * + * One subtlety comes from needing to ignore entries on the border of + * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry + * to the left of the leftmost member We must ignore the entry above the + * rightmost member + * + * Another subtlety comes from our stripe running off the matrix if the + * strings aren't of the same size. Since string s is always swapped to + * be the shorter of the two, the stripe will always run off to the + * upper right instead of the lower left of the matrix. + * + * As a concrete example, suppose s is of length 5, t is of length 7, + * and our threshold is 1. In this case we're going to walk a stripe of + * length 3. The matrix would look like so: + * + * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | + * | | |#|#| 6 | | | | |#| 7 | | | | | | + * + * Note how the stripe leads off the table as there is no possible way + * to turn a string of length 5 into one of length 7 in edit distance of + * 1. + * + * Additionally, this implementation decreases memory usage by using two + * single-dimensional arrays and swapping them back and forth instead of + * allocating an entire n by m matrix. This requires a few minor + * changes, such as immediately returning when it's detected that the + * stripe has run off the matrix and initially filling the arrays with + * large values so that entries we don't compute are ignored. + * + * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for + * some discussion. + */ + + int n = left.length(); // length of s + int m = right.length(); // length of t + + // if one string is empty, the edit distance is necessarily the length + // of the other + if (n == 0) { + return m <= threshold ? m : -1; + } else if (m == 0) { + return n <= threshold ? n : -1; + } + + if (n > m) { + // swap the two strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); + } + + int[] p = new int[n + 1]; // 'previous' cost array, horizontally + int[] d = new int[n + 1]; // cost array, horizontally + int[] _d; // placeholder to assist in swapping p and d + + // fill in starting table values + final int boundary = Math.min(n, threshold) + 1; + for (int i = 0; i < boundary; i++) { + p[i] = i; + } + // these fills ensure that the value above the rightmost entry of our + // stripe will be ignored in following loop iterations + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); + Arrays.fill(d, Integer.MAX_VALUE); + + // iterates through t + for (int j = 1; j <= m; j++) { + final char t_j = right.charAt(j - 1); // jth character of t + d[0] = j; + + // compute stripe indices, constrain to array size + final int min = Math.max(1, j - threshold); + final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( + n, j + threshold); + + // the stripe may lead off of the table if s and t are of different + // sizes + if (min > max) { + return -1; + } + + // ignore entry left of leftmost + if (min > 1) { + d[min - 1] = Integer.MAX_VALUE; + } + + // iterates through [min, max] in s + for (int i = min; i <= max; i++) { + if (left.charAt(i - 1) == t_j) { + // diagonally left and up + d[i] = p[i - 1]; + } else { + // 1 + minimum of cell to the left, to the top, diagonally + // left and up + d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); + } + } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } + + // if p[n] is greater than the threshold, there's no guarantee on it + // being the correct + // distance + if (p[n] <= threshold) { + return p[n]; + } + return -1; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/StringMetric.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/StringMetric.java b/src/main/java/org/apache/commons/text/similarity/StringMetric.java new file mode 100644 index 0000000..6765bbd --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/StringMetric.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * <p> + * Marker interface for <a + * href='http://en.wikipedia.org/wiki/String_metric'>String Metrics</a> + * </p> + * + * <p> + * A string metric measures the distance between two text strings. + * </p> + * + * @param <R> return type of the edit distance + * @since 1.0 + */ +public interface StringMetric<R> { + + /** + * <p> + * Compares two strings. + * </p> + * + * @param left the first string + * @param right the second string + * @return result distance + */ + R compare(CharSequence left, CharSequence right); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/main/java/org/apache/commons/text/similarity/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/package-info.java b/src/main/java/org/apache/commons/text/similarity/package-info.java new file mode 100644 index 0000000..8e9d478 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * <p>Provides algorithms for string similarity.</p> + * + * @since 1.0 + */ +package org.apache.commons.text.similarity; http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java new file mode 100644 index 0000000..30b1dfc --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/TestFuzzyDistance.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.junit.Assert.assertEquals; + +import java.util.Locale; + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Unit tests for {@link org.apache.commons.text.FuzzyDistance}. + */ +public class TestFuzzyDistance { + + private static FuzzyDistance distance; + + @BeforeClass + public static void setUp() { + distance = new FuzzyDistance(); + } + + @Test + public void testGetFuzzyDistance() throws Exception { + assertEquals(0, (int) distance.compare("", "", Locale.ENGLISH)); + assertEquals(0, + (int) distance.compare("Workshop", "b", Locale.ENGLISH)); + assertEquals(1, + (int) distance.compare("Room", "o", Locale.ENGLISH)); + assertEquals(1, + (int) distance.compare("Workshop", "w", Locale.ENGLISH)); + assertEquals(2, + (int) distance.compare("Workshop", "ws", Locale.ENGLISH)); + assertEquals(4, + (int) distance.compare("Workshop", "wo", Locale.ENGLISH)); + assertEquals(3, (int) distance.compare( + "Apache Software Foundation", "asf", Locale.ENGLISH)); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetFuzzyDistance_NullNullNull() throws Exception { + distance.compare(null, null, null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetFuzzyDistance_StringNullLoclae() throws Exception { + distance.compare(" ", null, Locale.ENGLISH); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetFuzzyDistance_NullStringLocale() throws Exception { + distance.compare(null, "clear", Locale.ENGLISH); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetFuzzyDistance_StringStringNull() throws Exception { + distance.compare(" ", "clear", null); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java new file mode 100644 index 0000000..6477de0 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/TestJaroWrinklerDistance.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.junit.Assert.assertEquals; + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}. + */ +public class TestJaroWrinklerDistance { + + private static JaroWrinklerDistance distance; + + @BeforeClass + public static void setUp() { + distance = new JaroWrinklerDistance(); + } + + @Test + public void testGetJaroWinklerDistance_StringString() { + assertEquals(0.93d, (double) distance.compare("frog", "fog"), 0.0d); + assertEquals(0.0d, (double) distance.compare("fly", "ant"), 0.0d); + assertEquals(0.44d, (double) distance.compare("elephant", "hippo"), 0.0d); + assertEquals(0.91d, (double) distance.compare("ABC Corporation", "ABC Corp"), 0.0d); + assertEquals(0.93d, (double) distance.compare("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.0d); + assertEquals(0.94d, (double) distance.compare("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.0d); + assertEquals(0.9d, (double) distance.compare("PENNSYLVANIA", "PENNCISYLVNIA"), 0.0d); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetJaroWinklerDistance_NullNull() throws Exception { + distance.compare(null, null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetJaroWinklerDistance_StringNull() throws Exception { + distance.compare(" ", null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetJaroWinklerDistance_NullString() throws Exception { + distance.compare(null, "clear"); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/c0e8c021/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java new file mode 100644 index 0000000..7790b05 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/TestLevenshteinDistance.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.junit.Assert.assertEquals; + +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}. + */ +public class TestLevenshteinDistance { + + private static LevenshteinDistance distance; + + @BeforeClass + public static void setUp() { + distance = new LevenshteinDistance(); + } + + @Test + public void testGetLevenshteinDistance_StringString() { + assertEquals(0, (int) distance.compare("", "")); + assertEquals(1, (int) distance.compare("", "a")); + assertEquals(7, (int) distance.compare("aaapppp", "")); + assertEquals(1, (int) distance.compare("frog", "fog")); + assertEquals(3, (int) distance.compare("fly", "ant")); + assertEquals(7, (int) distance.compare("elephant", "hippo")); + assertEquals(7, (int) distance.compare("hippo", "elephant")); + assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz")); + assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo")); + assertEquals(1, (int) distance.compare("hello", "hallo")); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDistance_NullString() throws Exception { + distance.compare("a", null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDistance_StringNull() throws Exception { + distance.compare(null, "a"); + } + + @Test + public void testGetLevenshteinDistance_StringStringInt() { + // empty strings + assertEquals(0, (int) distance.compare("", "", 0)); + assertEquals(7, (int) distance.compare("aaapppp", "", 8)); + assertEquals(7, (int) distance.compare("aaapppp", "", 7)); + assertEquals(-1, (int) distance.compare("aaapppp", "", 6)); + + // unequal strings, zero threshold + assertEquals(-1, (int) distance.compare("b", "a", 0)); + assertEquals(-1, (int) distance.compare("a", "b", 0)); + + // equal strings + assertEquals(0, (int) distance.compare("aa", "aa", 0)); + assertEquals(0, (int) distance.compare("aa", "aa", 2)); + + // same length + assertEquals(-1, (int) distance.compare("aaa", "bbb", 2)); + assertEquals(3, (int) distance.compare("aaa", "bbb", 3)); + + // big stripe + assertEquals(6, (int) distance.compare("aaaaaa", "b", 10)); + + // distance less than threshold + assertEquals(7, (int) distance.compare("aaapppp", "b", 8)); + assertEquals(3, (int) distance.compare("a", "bbb", 4)); + + // distance equal to threshold + assertEquals(7, (int) distance.compare("aaapppp", "b", 7)); + assertEquals(3, (int) distance.compare("a", "bbb", 3)); + + // distance greater than threshold + assertEquals(-1, (int) distance.compare("a", "bbb", 2)); + assertEquals(-1, (int) distance.compare("bbb", "a", 2)); + assertEquals(-1, (int) distance.compare("aaapppp", "b", 6)); + + // stripe runs off array, strings not similar + assertEquals(-1, (int) distance.compare("a", "bbb", 1)); + assertEquals(-1, (int) distance.compare("bbb", "a", 1)); + + // stripe runs off array, strings are similar + assertEquals(-1, (int) distance.compare("12345", "1234567", 1)); + assertEquals(-1, (int) distance.compare("1234567", "12345", 1)); + + // old getLevenshteinDistance test cases + assertEquals(1, (int) distance.compare("frog", "fog", 1)); + assertEquals(3, (int) distance.compare("fly", "ant", 3)); + assertEquals(7, (int) distance.compare("elephant", "hippo", 7)); + assertEquals(-1, (int) distance.compare("elephant", "hippo", 6)); + assertEquals(7, (int) distance.compare("hippo", "elephant", 7)); + assertEquals(-1, (int) distance.compare("hippo", "elephant", 6)); + assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8)); + assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8)); + assertEquals(1, (int) distance.compare("hello", "hallo", 1)); + + assertEquals(1, + (int) distance.compare("frog", "fog", Integer.MAX_VALUE)); + assertEquals(3, (int) distance.compare("fly", "ant", Integer.MAX_VALUE)); + assertEquals(7, + (int) distance.compare("elephant", "hippo", Integer.MAX_VALUE)); + assertEquals(7, + (int) distance.compare("hippo", "elephant", Integer.MAX_VALUE)); + assertEquals(8, + (int) distance.compare("hippo", "zzzzzzzz", Integer.MAX_VALUE)); + assertEquals(8, + (int) distance.compare("zzzzzzzz", "hippo", Integer.MAX_VALUE)); + assertEquals(1, + (int) distance.compare("hello", "hallo", Integer.MAX_VALUE)); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDistance_NullStringInt() throws Exception { + distance.compare(null, "a", 0); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDistance_StringNullInt() throws Exception { + distance.compare("a", null, 0); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDistance_StringStringNegativeInt() + throws Exception { + distance.compare("a", "a", -1); + } + +} --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@commons.apache.org For additional commands, e-mail: dev-h...@commons.apache.org