Hello Friends, I hope you are doing fine. Today I am sharing java program to check two strings similarity. Actually yesterday I was working on a project in which I had to find similarity between two strings in percent. There are many algorithms available to do this and some of them are as follows:
- Levenshtein edit distance
- Damerau–Levenshtein distance
- Jaro-Winkler similarity
- Longest common subsequence problem
- Hamming distance
I used Jaro-Winkler’s algorithm in my project because it fits better in my project, but I am sharing Levenshtein edit distance based program also. So, here is the codes and their output below.
StringEqualityPercentCheckLevenshteinDistance.java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
public class StringEqualityPercentCheckLevenshteinDistance { public static void main(String args[]) { printSimilarity("", ""); printSimilarity("", "a"); printSimilarity("aaapppp", ""); printSimilarity("frog", "fog"); printSimilarity("fly", "ant"); printSimilarity("elephant", "hippo"); printSimilarity("hippo", "elephant"); printSimilarity("hippo", "zzzzzzzz"); printSimilarity("hello", "hallo"); printSimilarity("ABC Corporation", "ABC Corp"); printSimilarity("D N H Enterprises Inc", "D & H Enterprises, Inc."); printSimilarity("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"); printSimilarity("PENNSYLVANIA", "PENNCISYLVNIA"); printSimilarity("The quick fox brown jumped", "The quick brown fox"); } public static void printSimilarity(String s, String t) { System.out.println(String.format("%.3f is the similarity between \"%s\" and \"%s\"", similarity(s, t), s, t)); } public static double similarity(String s1, String s2) { String longer = s1, shorter = s2; if (s1.length() < s2.length()) { longer = s2; shorter = s1; } int longerLength = longer.length(); if (longerLength == 0) { return 1.0; /* both strings have zero length */ } return (longerLength - getLevenshteinDistance(longer, shorter)) / (double) longerLength; } /** * LevenshteinDistance * copied from https://commons.apache.org/proper/commons-lang/javadocs/api-2.5/src-html/org/apache/commons/lang/StringUtils.html#line.6162 */ public static int getLevenshteinDistance(String s, String t) { if (s == null || t == null) { throw new IllegalArgumentException("Strings must not be null"); } int n = s.length(); // length of s int m = t.length(); // length of t if (n == 0) { return m; } else if (m == 0) { return n; } if (n > m) { // swap the input strings to consume less memory String tmp = s; s = t; t = tmp; n = m; m = t.length(); } int p[] = new int[n + 1]; //'previous' cost array, horizontally int d[] = new int[n + 1]; // cost array, horizontally int _d[]; //placeholder to assist in swapping p and d // indexes into strings s and t int i; // iterates through s int j; // iterates through t char t_j; // jth character of t int cost; // cost for (i = 0; i <= n; i++) { p[i] = i; } for (j = 1; j <= m; j++) { t_j = t.charAt(j - 1); d[0] = j; for (i = 1; i <= n; i++) { cost = s.charAt(i - 1) == t_j ? 0 : 1; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); } // copy current distance counts to 'previous row' distance counts _d = p; p = d; d = _d; } // our last action in the above loop was to switch d and p, so p now // actually has the most recent cost counts return p[n]; } } |
Output
StringEqualityPercentCheckUsingJaroWinklerDistance.java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import java.util.Arrays; public class StringEqualityPercentCheckUsingJaroWinklerDistance { public static void main(String args[]) { printSimilarity("", ""); printSimilarity("", "a"); printSimilarity("aaapppp", ""); printSimilarity("frog", "fog"); printSimilarity("fly", "ant"); printSimilarity("elephant", "hippo"); printSimilarity("hippo", "elephant"); printSimilarity("hippo", "zzzzzzzz"); printSimilarity("hello", "hallo"); printSimilarity("ABC Corporation", "ABC Corp"); printSimilarity("D N H Enterprises Inc", "D & H Enterprises, Inc."); printSimilarity("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"); printSimilarity("PENNSYLVANIA", "PENNCISYLVNIA"); printSimilarity("The quick fox brown jumped", "The quick brown fox"); } public static void printSimilarity(String s, String t) { System.out.println(String.format("%.3f is the similarity between \"%s\" and \"%s\"", similarity(s, t), s, t)); } /** * JaroWinklerDistance * Copied from https://commons.apache.org/sandbox/commons-text/jacoco/org.apache.commons.text.similarity/JaroWinklerDistance.java.html * apply method changed to similarity */ public static Double similarity(final CharSequence left, final CharSequence right) { final double defaultScalingFactor = 0.1; final double percentageRoundValue = 100.0; if (left == null || right == null) { throw new IllegalArgumentException("Strings must not be null"); } int[] mtp = matches(left, right); double m = mtp[0]; if (m == 0) { return 0D; } double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3; double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j); return Math.round(jw * percentageRoundValue) / percentageRoundValue; } protected static int[] matches(final CharSequence first, final CharSequence second) { CharSequence max, min; if (first.length() > second.length()) { max = first; min = second; } else { max = second; min = first; } int range = Math.max(max.length() / 2 - 1, 0); int[] matchIndexes = new int[min.length()]; Arrays.fill(matchIndexes, -1); boolean[] matchFlags = new boolean[max.length()]; int matches = 0; for (int mi = 0; mi < min.length(); mi++) { char c1 = min.charAt(mi); for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) { if (!matchFlags[xi] && c1 == max.charAt(xi)) { matchIndexes[mi] = xi; matchFlags[xi] = true; matches++; break; } } } char[] ms1 = new char[matches]; char[] ms2 = new char[matches]; for (int i = 0, si = 0; i < min.length(); i++) { if (matchIndexes[i] != -1) { ms1[si] = min.charAt(i); si++; } } for (int i = 0, si = 0; i < max.length(); i++) { if (matchFlags[i]) { ms2[si] = max.charAt(i); si++; } } int transpositions = 0; for (int mi = 0; mi < ms1.length; mi++) { if (ms1[mi] != ms2[mi]) { transpositions++; } } int prefix = 0; for (int mi = 0; mi < min.length(); mi++) { if (first.charAt(mi) == second.charAt(mi)) { prefix++; } else { break; } } return new int[] { matches, transpositions / 2, prefix, max.length() }; } } |
Output
References
Thanks! Please share if you like it
Comments